Separate/import e-mail attachments for better search V2

OK, made some good progress! The code now seems to work, but I’m not proud of it. :slight_smile: It’s quite messy: some functions — I mean handlers (thank you for that information, @chrillek) — are not being used, and there may be some edge cases that are not handled properly (for instance, when there are attachments with similar filenames…).

All that seems to confirm that an approach with two languages is less robust: many errors came from a different handling of filenames of attachments between DT and Python. Therefore I can only hope @stratadata ’s solution works for me, or that you and many others will massively support my feature request and the developers will find it reasonable to implement:

What is still not working (but I can live with that for now) is the handling of some text attachments of e-mail error messages. The Python routine gives them unique names like
attachment-1ff2eeb1-45bc-4516-bc58-28896852dcda.fc822-headers
whereas DT gives them nice compact names like
text.rfc822-headers

Now, without further ado, here’s the code that works for me. First the AppleScript:

-- Script prerequisites: AppleScript 2.4, scripting additions (for do shell script,
-- display dialog, path to) and the Foundation framework (used by fromJSON/toJSON).
use AppleScript version "2.4"
use scripting additions
use framework "Foundation"

-- Shorthand for "current application", used by the AppleScriptObjC handlers.
property ca : a reference to current application
-- Command prefix used to launch the companion Python script.
property pythonCmd : "/usr/bin/env python3"
-- Tag added to a record after its attachments were extracted and replaced.
property replacedTagName : "attachments-extracted"
-- Tag added to a record for which the Python script reported no attachments.
property notReplacedTagName : "no-attachments-extracted"

-- Determine the folder that contains this script file.
tell application "System Events"
	set scriptPath to path of (path to me)
	set parentFolder to POSIX path of (container of file scriptPath)
end tell

-- The Python helper is expected to sit next to this script.
set pythonScriptPath to parentFolder & "/replace-attachments.py"

-- Build the shell command prefix; the .eml path is appended per record.
-- NOTE(review): this is plain string concatenation; the Finder tell block does not appear to be needed.
tell application "Finder"
	set replaceCmd to pythonCmd & " " & quoted form of pythonScriptPath & " "
end tell

-- Main routine: for every selected .eml record, ask the Python script which
-- attachments it contains, convert the record to RTFD, move the matching files
-- out of the RTFD package, import them into a new group, and finally let the
-- Python script replace the attachments in the .eml with links to the imports.
tell application id "DNtp"
	set theSelection to the selection
	set tmpFolder to path to temporary items
	
	repeat with theRecord in theSelection
		-- Single-pass inner loop so that "exit repeat" skips to the next record.
		repeat 1 times
			-- display dialog "Verarbeite: " & (name of theRecord)
			
			set recordPath to path of theRecord
			-- display dialog "Pfad: " & recordPath & return & "Typ: " & (type of theRecord as rich text) & return & "Tags: " & (tags of theRecord as rich text)
			
			-- Only handle .eml records that do not yet carry the "replaced" tag.
			if (type of theRecord is email or type of theRecord is unknown) and recordPath ends with ".eml" and (tags of theRecord does not contain replacedTagName) then
				-- First pass of the Python script (no -b option): its output is the list of attachment names as JSON.
				try
					set foundAttachmentsJSON to do shell script replaceCmd & (quoted form of recordPath)
				on error errMsg
					display dialog "Fehler beim Python-Skript:" & return & errMsg
					exit repeat
				end try
				
				-- Empty output means nothing to extract: tag the record and move on.
				if foundAttachmentsJSON is equal to "" then
					log "Keine Anhänge vom Python-Skript erkannt."
					set tags of theRecord to (tags of theRecord) & {notReplacedTagName}
					exit repeat
				end if
				
				set foundAttachments to my fromJSON(foundAttachmentsJSON)
				-- display dialog "Gefundene Anhänge: " & (foundAttachments as rich text)
				
				-- Remember metadata of the e-mail so it can be copied to the group and the imported files.
				set recordReferenceURL to reference URL of theRecord
				set recordSubject to name of theRecord
				set recordModificationDate to modification date of theRecord
				set recordCreationDate to creation date of theRecord
				set recordAdditionDate to addition date of theRecord
				-- The group is created lazily, when the first attachment has been moved successfully.
				set recordGroup to missing value
				-- Collects {original filename, reference URL} pairs for the second Python pass.
				set extractedAttachments to {}
				
				-- Convert the e-mail to rich text; the attachment files are then read from the resulting RTFD's path.
				set rtfRecord to convert record theRecord to rich
				log "RTF-Konvertierungstyp: " & (type of rtfRecord as rich text)
				
				if type of rtfRecord is RTFD then
					set rtfPath to path of rtfRecord
					
					tell rich text of rtfRecord
						tell application "Finder"
							
							-- List the files at the RTFD path.
							-- NOTE(review): if this fails, only a dialog is shown; rtfAttachmentList stays undefined and the following statements will presumably error.
							try
								tell application "System Events"
									-- Work directly with the path as a disk item
									set rtfFolder to disk item rtfPath
									set rtfAttachmentList to files of rtfFolder
								end tell
							on error errMsg number errNum
								display dialog "Error " & errNum & ": " & errMsg
							end try
							
							log "Anzahl Dateien im RTF: " & (count of rtfAttachmentList)
							
							repeat with rtfAttachment in rtfAttachmentList
								set rtfAttachmentName to name of rtfAttachment as string
								-- display dialog "Datei im RTF: " & rtfAttachmentName
								-- display dialog "Vergleiche:" & return & "RTF-Datei: " & rtfAttachmentName & return & "JSON-Anhänge: " & (foundAttachments as text) & return & "RTF (klein): " & my lowercaseText(rtfAttachmentName)
								-- Replace the existing matching logic with this:
								-- Decide whether this file corresponds to one of the names reported by Python.
								set nameFound to false
								repeat with itemName in foundAttachments
									set normalizedItem to my normalizeText(itemName)
									set normalizedRTF to my normalizeText(rtfAttachmentName)
									log "normalizedItem: " & normalizedItem
									log "normalizedRTF: " & normalizedRTF
									
									-- Direct match
									if normalizedRTF = normalizedItem then
										set nameFound to true
										exit repeat
									end if
									
									-- Check if RTF filename starts with the expected name + "."
									-- This handles cases like "IMG_7871.jpeg.jpg" matching "IMG_7871.jpeg"
									if normalizedRTF starts with (normalizedItem & ".") then
										set nameFound to true
										exit repeat
									end if
									
									-- Enhanced fuzzy matching for filename variations
									if my fuzzyFilenameMatch(normalizedRTF, normalizedItem) then
										set nameFound to true
										exit repeat
									end if
								end repeat
								
								
								if nameFound then
									-- Retry logic for file moving with shell command (more reliable than Finder)
									set moveSuccessful to false
									repeat with retryCount from 1 to 3
										try
											-- Get POSIX path before moving
											set sourcePath to POSIX path of (rtfAttachment as alias)
											set fileName to name of (rtfAttachment as alias)
											set targetPath to (POSIX path of tmpFolder) & fileName
											
											-- Use shell command to move file (more reliable than Finder)
											do shell script "mv " & quoted form of sourcePath & " " & quoted form of targetPath
											
											-- If we get here, the move was successful
											set moveSuccessful to true
											exit repeat
											
										on error errMsg number errNum
											-- Give up with a dialog after the third failure, otherwise wait one second and retry.
											if retryCount = 3 then
												display dialog "Failed to move file after 3 attempts: " & errMsg & " (Error " & errNum & ")"
												exit repeat
											else
												log "Move attempt " & retryCount & " failed, retrying in 1 second: " & errMsg
												delay 1
											end if
										end try
									end repeat
									
									-- Only proceed with import if file move was successful
									if moveSuccessful then
										tell application id "DNtp"
											-- Create the target group next to the e-mail on first use, copying name and dates from the e-mail.
											if recordGroup is missing value then
												set recordGroup to create record with {name:recordSubject, type:group, creation date:recordCreationDate, modification date:recordModificationDate, addition date:recordAdditionDate} in (parent 1 of theRecord)
											end if
											
											-- Import the moved file and link it back to the e-mail through its URL field.
											set movedPath to POSIX path of (POSIX file targetPath)
											set importedItem to import path movedPath to recordGroup
											set URL of importedItem to recordReferenceURL
											set modification date of importedItem to recordModificationDate
											set creation date of importedItem to recordCreationDate
											
											-- IMPORTANT: Use the original filename from foundAttachments, not rtfAttachmentName
											set originalFileName to my getMatchingOriginalName(rtfAttachmentName, foundAttachments)
											
											-- Check if this original filename is already in extractedAttachments
											set alreadyExtracted to false
											repeat with existingItem in extractedAttachments
												if item 1 of existingItem = originalFileName then
													set alreadyExtracted to true
													log "Skipping duplicate attachment: " & originalFileName
													exit repeat
												end if
											end repeat
											
											-- Only add if not already extracted
											if not alreadyExtracted then
												set end of extractedAttachments to {originalFileName, ((reference URL of importedItem) as string)}
												log "Successfully imported: " & originalFileName
											else
												log "Duplicate attachment skipped: " & originalFileName
											end if
											
											-- NOTE(review): this log runs in both branches above, so the message is logged twice for new items and also for duplicates.
											log "Successfully imported: " & originalFileName
										end tell
									else
										log "Skipping import due to failed file move for: " & rtfAttachmentName
									end if
								end if
							end repeat
						end tell
						
						log "count of extractedAttachments: " & (count of extractedAttachments)
						-- Second pass: hand the {name, URL} pairs to Python so it can rewrite the .eml.
						if (count of extractedAttachments) > 0 then
							set extractedAttachmentsJSON to my toJSON(extractedAttachments)
							log "Original JSON: " & extractedAttachmentsJSON
							
							-- Clean the JSON before base64 encoding
							set cleanJSON to my cleanJSONString(extractedAttachmentsJSON)
							log "Cleaned JSON: " & cleanJSON
							
							-- Base64 encode using printf (more reliable)
							set b64JSON to do shell script "printf '%s' " & quoted form of cleanJSON & " | base64 | tr -d '\\n'"
							
							
							-- Remove any newlines from base64 output (important!)
							-- NOTE(review): the pipeline above already ends in tr -d '\n'; this second pass looks redundant.
							set b64JSON to do shell script "echo " & quoted form of b64JSON & " | tr -d '\\n'"
							
							log "Base64 encoded: " & b64JSON
							
							tell application id "DNtp"
								-- Put the e-mail into the same group as its extracted attachments.
								move record theRecord to recordGroup
								
								-- Construct command more explicitly
								-- NOTE(review): recordPath was read before the move above; presumably the file path is unchanged by moving the record — confirm.
								set pythonPath to quoted form of pythonScriptPath
								set b64Param to quoted form of b64JSON
								set emlPath to quoted form of recordPath
								set fullCommand to pythonCmd & " " & pythonPath & " -b " & b64Param & " " & emlPath
								
								log "Executing command: " & fullCommand
								do shell script fullCommand
								
								-- Mark the record so it is skipped on later runs.
								set tags of theRecord to (tags of theRecord) & {replacedTagName}
							end tell
						end if
						
						
					end tell
					
					-- The temporary rich-text conversion is no longer needed.
					delete record rtfRecord
				else
					display dialog "RTF-Konvertierung hat kein RTFD geliefert."
				end if
			end if
		end repeat
	end repeat
end tell

on normalizeText(t)
	-- Returns t in lower case with leading and trailing whitespace removed.
	-- Uses Foundation instead of an echo | tr | sed shell pipeline: this handler is
	-- called for every attachment/filename pair, so a shell per call is slow, and
	-- tr only lower-cases ASCII letters in the locale do shell script runs with.
	-- "t as text" also resolves list-item references passed in from repeat loops.
	set sourceString to ca's NSString's stringWithString:(t as text)
	set loweredString to sourceString's lowercaseString()
	set trimmedString to loweredString's stringByTrimmingCharactersInSet:(ca's NSCharacterSet's whitespaceAndNewlineCharacterSet())
	return trimmedString as text
end normalizeText

on fromJSON(strJSON)
	-- Parses a JSON string with NSJSONSerialization.
	-- Returns an AppleScript record for a JSON object, otherwise a list.
	-- Raises an error when the JSON cannot be parsed.
	set jsonString to ca's NSString's stringWithString:strJSON
	set jsonData to jsonString's dataUsingEncoding:(ca's NSUTF8StringEncoding)
	set {parsedObject, parseError} to ca's NSJSONSerialization's JSONObjectWithData:jsonData options:0 |error|:(reference)
	
	if parsedObject is missing value then error parseError's localizedDescription() as text
	if parseError ≠ missing value then error parseError
	
	if (parsedObject's isKindOfClass:(ca's NSDictionary)) then return parsedObject as record
	return parsedObject as list
end fromJSON

on toJSON(theData)
	-- Serializes an AppleScript list/record to a compact JSON string (UTF-8).
	-- Serialization errors are not reported (the error argument is missing value).
	set encodedData to ca's NSJSONSerialization's dataWithJSONObject:theData options:0 |error|:(missing value)
	set jsonNSString to ca's NSString's alloc()'s initWithData:encodedData encoding:(ca's NSUTF8StringEncoding)
	return jsonNSString as text
end toJSON

on lowercaseText(t)
	-- Lower-cases t by piping it through tr in a shell.
	set shellCommand to "/bin/echo " & quoted form of t & " | tr '[:upper:]' '[:lower:]'"
	return do shell script shellCommand
end lowercaseText

on lowercaseList(theList)
	-- Returns a new list holding the lower-cased form of every item of theList.
	set loweredItems to {}
	repeat with idx from 1 to (count of theList)
		set end of loweredItems to my lowercaseText(item idx of theList)
	end repeat
	return loweredItems
end lowercaseList


on fuzzyFilenameMatch(rtfName, originalName)
	-- Decides whether rtfName (a file name found in the RTFD) refers to the same
	-- attachment as originalName (the name reported for the e-mail).
	-- Both names are stripped of known extensions and surrounding quotes/spaces,
	-- then compared exactly, as a truncated prefix, with "17h58"-style times
	-- collapsed, and with "&" replaced by "-". Returns true on the first hit.
	set rtfCleaned to my cleanLeadingTrailing(my removeFileExtensions(rtfName))
	set originalCleaned to my cleanLeadingTrailing(my removeFileExtensions(originalName))
	
	log "Debug - Original cleaned: " & originalCleaned
	log "Debug - RTF cleaned: " & rtfCleaned
	
	-- Exact match once extensions and quotes are gone
	if rtfCleaned = originalCleaned then return true
	
	-- The RTFD name may be a shortened form of the original name
	if my isTruncatedMatch(rtfCleaned, originalCleaned) then
		log "Debug - Truncated match found!"
		return true
	end if
	
	-- Same two checks against the time-normalized original
	set normalizedOriginal to my normalizeTimeFormat(originalCleaned)
	if rtfCleaned = normalizedOriginal then return true
	if my isTruncatedMatch(rtfCleaned, normalizedOriginal) then return true
	
	-- Candidate spellings, including the "&" -> "-" replacement when applicable
	set variations to {originalCleaned, normalizedOriginal}
	if originalCleaned contains "&" then
		set ampersandVariation to my replaceText(originalCleaned, "&", "-")
		set variations to variations & {ampersandVariation, my normalizeTimeFormat(ampersandVariation)}
	end if
	
	repeat with idx from 1 to (count of variations)
		set varStr to (item idx of variations) as string
		if rtfCleaned = varStr then return true
		if my isTruncatedMatch(rtfCleaned, varStr) then return true
	end repeat
	
	return false
end fuzzyFilenameMatch

on removeFileExtensions(fileName)
	-- Strips up to three stacked, known file extensions (e.g. "x.jpeg.jpg" -> "x")
	-- and then one trailing apostrophe, returning the shortened name.
	set knownExtensions to {".pdf", ".jpeg", ".jpg", ".png", ".gif", ".docx", ".doc", ".xlsx", ".xls", ".txt", ".rtf"}
	set cleanName to fileName
	
	repeat 3 times -- at most three extensions deep
		set strippedOne to false
		repeat with extRef in knownExtensions
			set ext to contents of extRef
			if cleanName ends with ext then
				set cleanName to text 1 thru ((length of cleanName) - (length of ext)) of cleanName
				set strippedOne to true
				exit repeat
			end if
		end repeat
		
		-- Nothing left to strip: stop early
		if not strippedOne then exit repeat
	end repeat
	
	-- Drop a single trailing apostrophe
	if cleanName ends with "'" then
		set cleanName to text 1 thru ((length of cleanName) - 1) of cleanName
	end if
	
	return cleanName
end removeFileExtensions


on replaceText(originalText, searchString, replacementString)
	-- Returns originalText with every occurrence of searchString replaced by
	-- replacementString, using text item delimiters.
	-- The delimiters are global state, so the caller's value is saved and put
	-- back afterwards (also when an error occurs) instead of being forced to "".
	set savedDelimiters to AppleScript's text item delimiters
	try
		set AppleScript's text item delimiters to searchString
		set textItems to text items of originalText
		set AppleScript's text item delimiters to replacementString
		set newText to textItems as string
	on error errMsg number errNum
		set AppleScript's text item delimiters to savedDelimiters
		error errMsg number errNum
	end try
	set AppleScript's text item delimiters to savedDelimiters
	return newText
end replaceText

on cleanFilename(fileName)
	-- Replaces characters that DEVONthink might convert ("&", "/", ":") with "-".
	-- The former separate " & " -> " - " step was removed: it ran after every "&"
	-- had already been replaced, so it could never match. The result is unchanged.
	set cleanName to fileName
	repeat with problemChar in {"&", "/", ":"}
		set cleanName to my replaceText(cleanName, contents of problemChar, "-")
	end repeat
	return cleanName
end cleanFilename

on normalizeTimeFormat(fileName)
	-- Collapses time stamps such as "17h58" to "1758" by removing every "h" that
	-- sits between two digits, then removes all colons. Returns the new name.
	set workName to fileName
	set keepScanning to true
	
	-- Rescan from the start after each removal because the positions shift.
	repeat while keepScanning
		set keepScanning to false
		set nameLength to length of workName
		
		repeat with pos from 2 to (nameLength - 2)
			if character pos of workName = "h" then
				set leftChar to character (pos - 1) of workName
				set rightChar to character (pos + 1) of workName
				
				-- Only an "h" with a digit on both sides is removed
				if leftChar ≥ "0" and leftChar ≤ "9" and rightChar ≥ "0" and rightChar ≤ "9" then
					set workName to (text 1 thru (pos - 1) of workName) & (text (pos + 1) thru nameLength of workName)
					set keepScanning to true
					exit repeat
				end if
			end if
		end repeat
	end repeat
	
	-- Colons are dropped as well
	return my replaceText(workName, ":", "")
end normalizeTimeFormat


on replaceTextWithRegex(originalText, pattern, replacement)
	-- Hand-rolled stand-in for one regular expression only:
	-- "([0-9]{1,2})h([0-9]{2})" - every "9h30"/"17h58" becomes "930"/"1758".
	-- Any other pattern returns originalText unchanged; replacement is not used.
	-- Fix: a match at the very start or end of the text used to raise an error,
	-- because "text 1 thru 0" and "text (n + 1) thru n" are invalid ranges.
	set resultText to originalText
	if pattern ≠ "([0-9]{1,2})h([0-9]{2})" then return resultText
	
	repeat
		set foundPattern to false
		set textLength to length of resultText
		
		repeat with i from 1 to (textLength - 3)
			set char1 to character i of resultText
			set char2 to character (i + 1) of resultText
			set char3 to character (i + 2) of resultText
			set char4 to character (i + 3) of resultText
			
			-- matchLength stays 0 until a time stamp is recognized at position i
			set matchLength to 0
			set digitsOnly to ""
			
			if char1 ≥ "0" and char1 ≤ "9" and char2 = "h" and char3 ≥ "0" and char3 ≤ "9" and char4 ≥ "0" and char4 ≤ "9" then
				-- Single digit hour (9h58)
				set matchLength to 4
				set digitsOnly to char1 & char3 & char4
			else if i ≤ (textLength - 4) then
				-- Double digit hour (17h58) needs one more character
				set char5 to character (i + 4) of resultText
				if char1 ≥ "0" and char1 ≤ "9" and char2 ≥ "0" and char2 ≤ "9" and char3 = "h" and char4 ≥ "0" and char4 ≤ "9" and char5 ≥ "0" and char5 ≤ "9" then
					set matchLength to 5
					set digitsOnly to char1 & char2 & char4 & char5
				end if
			end if
			
			if matchLength > 0 then
				-- Build prefix/suffix only when they are non-empty ranges
				set prefixText to ""
				if i > 1 then set prefixText to text 1 thru (i - 1) of resultText
				set suffixText to ""
				if (i + matchLength) ≤ textLength then set suffixText to text (i + matchLength) thru textLength of resultText
				
				set resultText to prefixText & digitsOnly & suffixText
				set foundPattern to true
				exit repeat
			end if
		end repeat
		
		-- Rescan after each replacement until nothing matches any more
		if not foundPattern then exit repeat
	end repeat
	
	return resultText
end replaceTextWithRegex

on characterSimilarityMatch(text1, text2)
	-- Returns true when the two strings differ in only a few characters:
	-- at most 2 differences when the longer string has more than 10 characters,
	-- otherwise at most 1. Strings whose lengths differ by more than 3 never match.
	-- Fix: surplus characters of the longer string now count as differences. The
	-- original code for this was unreachable, so "abc" matched "abcdef".
	set len1 to length of text1
	set len2 to length of text2
	
	set lengthGap to len1 - len2
	if lengthGap < 0 then set lengthGap to -lengthGap
	if lengthGap > 3 then return false
	
	if len1 < len2 then
		set sharedLength to len1
		set maxLength to len2
	else
		set sharedLength to len2
		set maxLength to len1
	end if
	
	-- Start with the unmatched tail, then add position-wise mismatches
	set differences to lengthGap
	repeat with k from 1 to sharedLength
		if character k of text1 ≠ character k of text2 then set differences to differences + 1
	end repeat
	
	if maxLength > 10 then return (differences ≤ 2)
	return (differences ≤ 1)
end characterSimilarityMatch

on cleanLeadingTrailing(theText)
	-- Returns theText without leading/trailing apostrophes, double quotes and spaces.
	-- The parameter was formerly named "text", which is AppleScript's built-in class
	-- name and is also needed below as the element specifier in "text 2 thru -1 of ...".
	-- Handler parameters are positional, so callers are unaffected by the rename.
	set cleanText to theText as text
	
	-- Strip from the front
	repeat while cleanText starts with "'" or cleanText starts with "\"" or cleanText starts with " "
		if length of cleanText ≤ 1 then return ""
		set cleanText to text 2 thru -1 of cleanText
	end repeat
	
	-- Strip from the back
	repeat while cleanText ends with "'" or cleanText ends with "\"" or cleanText ends with " "
		if length of cleanText ≤ 1 then return ""
		set cleanText to text 1 thru -2 of cleanText
	end repeat
	
	return cleanText
end cleanLeadingTrailing

on isTruncatedMatch(shortName, longName)
	-- Returns true when shortName is a proper prefix of longName and is at least
	-- 10 characters long (shorter prefixes would give too many false positives).
	-- The former character-by-character loop and the "logical break point" check
	-- were removed: they ran only after the prefix test had already failed, and the
	-- break-point check returned true on both of its paths, so neither could
	-- change the result.
	set shortLength to length of shortName
	
	if shortLength < 10 then return false
	if shortLength ≥ (length of longName) then return false
	
	return (longName starts with shortName)
end isTruncatedMatch

on cleanJSONString(JSONstr)
	-- Removes apostrophes that directly follow a double quote in the JSON text
	-- (first after an opening bracket, then anywhere), repeating until none is left.
	set cleanStr to JSONstr
	
	-- Each needle ends in the apostrophe to drop; the replacement is the needle without it
	repeat with needleRef in {"[\"'", "\"'"}
		set needle to contents of needleRef
		set keptPart to text 1 thru -2 of needle
		repeat while cleanStr contains needle
			set cleanStr to my replaceText(cleanStr, needle, keptPart)
		end repeat
	end repeat
	
	return cleanStr
end cleanJSONString

on getMatchingOriginalName(rtfName, originalList)
	-- Returns the first entry of originalList that fuzzy-matches rtfName.
	-- Falls back to rtfName itself (with a logged warning) when nothing matches.
	repeat with idx from 1 to (count of originalList)
		set candidate to item idx of originalList
		if my fuzzyFilenameMatch(rtfName, candidate) then return candidate as string
	end repeat
	
	log "Warning: No original filename match found for RTF name: " & rtfName
	return rtfName
end getMatchingOriginalName



Edit: added a small retry logic that avoids this error:

--> error number -10010
Ergebnis:
error "„Finder“ hat einen Fehler erhalten: Die Routine kann Objekte dieser Klasse nicht bearbeiten." number -10010

Since the RTFD file has just been imported into DT, DT might still be blocking it when the script tries to access it.

Edit: added some base64 encoding, also in the Python script, to avoid encoding problems with weird filenames. For instance I had one starting with ’ (quote), that didn’t go too well with the previous version.

And here’s the updated Python script:

#!/usr/bin/env python3
import argparse
import base64
import email
import html
import json
import logging
import uuid
from email import policy

# Adapted from https://github.com/Conengmo/emailstripper/blob/master/emailstripper/run_remove_attachments.py
# File-name suffixes treated as images; images below IMAGE_MIN_SIZE_KB are not reported.
IMAGE_EXTENSIONS = ('.jpg','.jpeg','.png','.gif','.tiff','.tif', '.bmp')
# Size threshold in kB (compared as IMAGE_MIN_SIZE_KB * 1e3 bytes) for image attachments.
IMAGE_MIN_SIZE_KB = 100
# File-name suffixes that are never reported as attachments.
IGNORE_EXTENSIONS = ('.dat','.rtf', '.ics')
# Exact attachment names that are skipped.
IGNORE_ATTACHMENTS = ('winmail.dat','application')

def walk_attachments(filename, replace_dict):
    """Report or replace the attachments of the e-mail stored in *filename*.

    With an empty *replace_dict* the names of the attachments found are printed
    to stdout as a JSON list. Otherwise *replace_dict* holds one
    ``[name, url]`` entry per attachment: the attachment contents are cleared,
    an HTML part linking to the replacements is appended, and the file is
    rewritten in place. Nothing is written when the number of replacements
    does not match the number of attachments found.

    Args:
        filename: path of the .eml file.
        replace_dict: replacements parsed from JSON; empty for report mode.

    Returns:
        None. Problems are logged rather than raised.
    """
    # Parse inside a context manager so the handle is closed in every case.
    try:
        with open(filename, "rb") as reader:
            msg = email.message_from_binary_file(reader, policy=policy.default)
    except IOError as e:
        logging.error("Can't open file %s: %s", filename, str(e))
        return

    replace = len(replace_dict) > 0

    # find attachments and clear their content if needed
    found_list = walk_over_parts(msg, [], filename, replace)

    if not found_list:
        logging.info("No attachments found to replace")
        return

    logging.info('Found %d attachments to replace in %s', len(found_list), filename)

    # report mode: only output the attachments that were found
    if not replace:
        print(json.dumps(found_list))
        return

    # we assume the replacements match the found attachments (only the count is checked)
    if len(found_list) != len(replace_dict):
        logging.error("Number of found attachments does not match number of replacements")
        return

    # add replacements in original e-mail
    msg.add_attachment(get_replace_text(replace_dict), disposition='inline', subtype="html")

    # Serialize BEFORE opening the file for writing: opening truncates it, so a
    # failure during serialization would otherwise destroy the original e-mail.
    # Bytes are written so the result does not depend on the locale encoding.
    try:
        serialized = msg.as_bytes()
    except UnicodeError as e:
        logging.error("Unicode error: %s", str(e))
        return

    with open(filename, 'wb') as writer:
        writer.write(serialized)
                
def walk_over_parts(parent, found_list, filename, replace = False):
    """Collect the names of the relevant attachments below *parent*.

    Walks the parts of a multipart message recursively. Body text
    (text/plain or text/html that is not an attachment) is skipped, as are
    parts rejected by ``parse_attachment``, images smaller than
    ``IMAGE_MIN_SIZE_KB`` and names ending in ``IGNORE_EXTENSIONS``. When
    *replace* is true the content of every reported attachment is cleared.

    Returns *found_list* with the attachment names appended.
    """
    # nothing to walk when the parent is not a multi-part message
    if not parent.is_multipart():
        return found_list

    for index, part in enumerate(parent.get_payload()):
        # plain or html content that isn't an attachment is body text
        is_text_type = part.get_content_type() in ["text/plain", "text/html"]
        if is_text_type and not part.is_attachment():
            continue

        # descend into nested multipart containers
        if part.is_multipart():
            found_list = walk_over_parts(part, found_list, filename, replace)
            continue

        content_size, attachment_name = parse_attachment(part)

        # not an attachment we care about
        if content_size is None:
            continue

        # small images and ignored extensions are left alone
        small_image = attachment_name.endswith(IMAGE_EXTENSIONS) and content_size < (IMAGE_MIN_SIZE_KB * 1e3)
        if small_image or attachment_name.endswith(IGNORE_EXTENSIONS):
            continue

        if replace:
            logging.info('Removing attachment %s with size %.0f kB.', attachment_name, content_size / 1e3)
            # clear the content of the attachment in place
            # (payload.pop(i) did not work in tests, so the part is emptied instead)
            siblings = parent.get_payload()
            siblings[index].set_content("")
            parent.set_payload(siblings)

        found_list.append(attachment_name)

    return found_list


def parse_attachment(part):
    """Return ``(content_size, attachment_name)`` for *part*.

    ``(None, None)`` is returned when the part should not be treated as an
    attachment: it has no 'attachment'/'inline' disposition, no name can be
    determined, it is an inline image, its name is listed in
    ``IGNORE_ATTACHMENTS``, or its payload is not a string.

    The size is an estimate in bytes that treats the payload as base64 text.
    """
    # only parts explicitly marked as attachment or inline are considered
    disposition = part.get_content_disposition()
    if disposition not in ('attachment', 'inline'):
        return None, None

    # try to get attachment name, falling back to a generated default
    attachment_name = part.get_filename()
    if attachment_name is None:
        attachment_name = create_default_name(part)
    if attachment_name is None:
        return None, None

    # do not consider inline images as relevant
    # (the original compared the bound method itself with "inline", which was always False)
    if attachment_name.endswith(IMAGE_EXTENSIONS) and disposition == "inline":
        return None, None
    # skip IGNORE_ATTACHMENTS
    if attachment_name in IGNORE_ATTACHMENTS:
        return None, None

    # calculate attachment size (to ignore too small attachments)
    content = part.get_payload()
    if not isinstance(content, str):
        # explicit check instead of assert, which is stripped under -O
        logging.warning("Skipping %s: payload is not a string", attachment_name)
        return None, None
    # https://stackoverflow.com/questions/11761889/get-image-file-size-from-base64-string
    content_size = (len(content) * 3) / 4 - content.count('=', -2)

    return content_size, attachment_name

""" Create a default name for a part"""
def create_default_name(part):
    """Create a default file name for a part that has none.

    The name has the form ``<disposition>-<uuid4>.<subtype>``, for example
    ``attachment-….rfc822-headers`` for a ``text/rfc822-headers`` part.
    Returns None when the part has no Content-Type header.

    The MIME subtype is taken from ``get_content_subtype()`` rather than by
    cutting six characters off the header value: that only worked for
    ``image/...`` and mangled every other type (``text/rfc822-headers``
    became ``fc822-headers``).
    """
    # header lookup is case-insensitive, unlike iterating the private _headers list
    if part.get('Content-Type') is None:
        return None

    # a part without a disposition header is named like a regular attachment
    disposition = part.get_content_disposition() or 'attachment'
    return disposition + '-' + str(uuid.uuid4()) + '.' + part.get_content_subtype()

""" Create HTML for replacement text"""
def get_replace_text(found_list):
    """Create the HTML block that links to the extracted attachments.

    Args:
        found_list: sequence of ``[name, url]`` pairs.

    Returns:
        An HTML document with one list item per pair, in reverse input order
        (as before). Names and URLs come from e-mail attachments, so both are
        HTML-escaped; a name containing ``'`` or ``<`` would otherwise break
        or inject markup.
    """
    items = [
        "\n\n<li><a href='{}?reveal=1'>{}</a></li>\r\n".format(
            html.escape(str(item[1]), quote=True),
            html.escape(str(item[0]), quote=True),
        )
        for item in reversed(list(found_list))
    ]
    return "<html><body style='font-family: helvetica; font-size: large;'><br/><br/><hr><p><strong>Attachments:</strong><ul>{}</ul><p></body></html>".format("".join(items))


if __name__ == '__main__':
    # Command-line entry point:
    #   replace-attachments.py FILE          -> print found attachment names as JSON
    #   replace-attachments.py -r JSON FILE  -> replace attachments (plain JSON)
    #   replace-attachments.py -b B64 FILE   -> replace attachments (base64-encoded JSON)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    parser = argparse.ArgumentParser(description='Replace attachments')
    parser.add_argument('filename', help='.eml file to parse')
    parser.add_argument('-r', dest='replace', help='JSON string of replacements')
    parser.add_argument('-b', dest='b64', help='base64-encoded JSON of replacements')
    args = parser.parse_args()

    # load the replace-dict from either -r or -b
    replace_dict = {}
    if args.replace:
        try:
            replace_dict = json.loads(args.replace)
        except json.JSONDecodeError as e:
            logging.error("JSON error: %s", e)
            # SystemExit instead of exit(): exit() is a site-module helper meant
            # for interactive use and is not guaranteed to exist in programs
            raise SystemExit(1)
    elif args.b64:
        try:
            raw = base64.b64decode(args.b64)
            replace_dict = json.loads(raw.decode('utf-8'))
        except ValueError as e:
            # binascii.Error, UnicodeDecodeError and json.JSONDecodeError are all ValueErrors
            logging.error("Base64/JSON decode error: %s", e)
            raise SystemExit(1)

    walk_attachments(args.filename, replace_dict)

What an adventure! Hopefully it’s going to be useful for someone.

Again, apologies for the dirty code.

All the best!