Last year I wrote a script to separate attachments from e-mails (.eml files). Recently I updated this script to make it more effective and efficient.
The script does the following:
- Check (via the Python script) each individual e-mail for non-inline attachments that should be imported into DT. The script ignores e.g. `.ics` attachments or `winmail.dat` attachments - see the Python script for what it ignores.
- Convert the e-mail to RTF and check which attachments from the RTF match the found attachments that should be replaced
- Import the found attachments into DT
- Create a JSON string for the replacements with the imported attachment names and reference URLs (`x-devonthink-item` links)
- Call the Python script again with the created JSON. The script will strip the found attachments (replace them with empty content) and add an inline HTML part to the e-mail with a list of links to the DT items
The script consists of two parts:
- AppleScript: `replace-attachments.scpt`
- Python script: `replace-attachments.py`
To install the script(s):
- Save the AppleScript and Python script and put them in the same directory - check the properties in the top of the script so the scripts can be found
- Make sure you have python3 installed (e.g. using `brew install python3`) and make sure you have the needed modules by using `pip install` if needed (see the top of the Python script).
To use the script(s):
- Create a selection in DT; the easiest way is to select only e-mails with attachments by searching for `md_attachments>0`
- The AppleScript is run on each individual message. This works fast enough for me (e.g. 7min on 1200 messages with attachments on my M1 Pro Macbook)
Tips:
- You can run this script for each individual message, so you can also use it in a Smart Rule, e.g. when importing (e-mail) messages. Be aware that the Foundation framework sometimes doesn’t work in external scripts, while it mostly seems to work in inline scripts
- The script uses some Foundation framework functions to convert to/from JSON. You could also put these functions in a separate helper script.
AppleScript (`replace-attachments.scpt`):
use AppleScript version "2.4" -- Yosemite (10.10) or later
use scripting additions
use framework "Foundation"
-- Shortcut for the AppleScriptObjC bridge; used by the JSON handlers below.
property ca : a reference to current application
-- Command used to start the Python interpreter.
property pythonCmd : "/usr/bin/env python3"
-- File name of the Python helper; it must live in the same folder as this script.
property pythonScriptName : "replace-attachments.py"
-- Tag added to e-mails that have been processed, so they are skipped next time.
property replacedTagName : "attachments-extracted"
tell application "Finder"
    -- folder that contains this script, as a POSIX path
    set currentPath to POSIX path of ((container of (path to me)) as alias)
    -- Quote the COMPLETE script path. "quoted form of a & b" only quotes a, which
    -- left the script name unquoted and broke for names containing spaces or
    -- shell metacharacters.
    set replaceCmd to pythonCmd & " " & quoted form of (currentPath & pythonScriptName) & " "
end tell
-- Main loop: for every selected DEVONthink record that is an untagged .eml file,
-- ask the Python script which attachments should be extracted, import those
-- attachments from an RTFD conversion of the e-mail into a new group, and then
-- call the Python script again to strip them from the e-mail file.
tell application id "DNtp"
set theSelection to the selection
-- staging folder: attachments are moved here before they are imported
set tmpFolder to path to temporary items
repeat with theRecord in theSelection
-- second repeat loop so we can mimic the behavior of the 'continue'
-- command which doesn't exist in AppleScript ('exit repeat' below leaves
-- only this inner loop, so the outer loop continues with the next record)
repeat 1 times
-- NOTE(review): 'type ... is unknown' is presumably how DEVONthink reports e-mail records - confirm
if type of theRecord is unknown and path of theRecord ends with ".eml" and (tags of theRecord does not contain replacedTagName) then
set recordPath to path of theRecord
-- check if there are any attachments to replace; otherwise proceed to next e-mail
-- (first pass, without -r: the Python script only prints a JSON list of attachment names)
set foundAttachmentsJSON to do shell script replaceCmd & (quoted form of recordPath)
if foundAttachmentsJSON is not equal to "" then
set foundAttachments to my fromJSON(foundAttachmentsJSON)
else
exit repeat
end if
-- set details of e-mail to variables
-- (referencing details directly in statements sometimes results in weird errors)
set recordReferenceURL to reference URL of theRecord
set recordSubject to name of theRecord
set recordModificationDate to modification date of theRecord
set recordCreationDate to creation date of theRecord
set recordAdditionDate to addition date of theRecord
-- the group is created lazily, only once the first matching attachment is found
set recordGroup to missing value
-- list of {attachment name, reference URL} pairs handed to the Python script
set extractedAttachments to {}
-- convert the e-mail to RTF format
set rtfRecord to convert record theRecord to rich
try
if type of rtfRecord is rtfd then
set rtfPath to path of rtfRecord
tell text of rtfRecord
if exists attachment in attribute runs then
tell application "Finder"
-- the converted record's path is opened as a folder; its files are the candidate attachments
set rtfAttachmentList to every file in ((POSIX file rtfPath) as alias)
repeat with rtfAttachment in rtfAttachmentList
set rtfAttachmentName to name of rtfAttachment as string
-- only handle files whose name was reported by the Python script
if rtfAttachmentName is in foundAttachments then
-- importing skips files inside record database package, so move record to a temporary folder first
set rtfAttachment to move (rtfAttachment as alias) to tmpFolder with replacing
tell application id "DNtp"
-- create a group if needed
if recordGroup is missing value then
set recordGroup to create record with {name:recordSubject, type:group, creation date:recordCreationDate, modification date:recordModificationDate, addition date:recordAdditionDate} in (parent 1 of theRecord)
end if
-- import the attachment
set importedItem to import (POSIX path of (rtfAttachment as string)) to recordGroup
-- link imported item to original e-mail
set URL of importedItem to recordReferenceURL
-- set dates of imported item to those of the original e-mail
set modification date of importedItem to recordModificationDate
set creation date of importedItem to recordCreationDate
-- add this attachment to the list of extracted attachments
set end of extractedAttachments to {rtfAttachmentName, ((reference URL of importedItem) as string)}
log "Found attachment \"" & rtfAttachmentName & "\" to remove from e-mail " & recordSubject
end tell
end if
end repeat
end tell
if (count of extractedAttachments) is greater than 0 then
-- convert list of extracted attachments to JSON
set extractedAttachmentsJSON to my toJSON(extractedAttachments)
tell application id "DNtp"
-- move the e-mail to the group with attachments
move record theRecord to recordGroup
-- run Python script to replace attachments based on given JSON
-- (second pass: -r carries the {name, URL} pairs; recordPath was captured before the move)
do shell script replaceCmd & "-r " & quoted form of extractedAttachmentsJSON & " " & quoted form of recordPath
log "Removed attachments from \"" & recordSubject & "\""
-- add a tag so we know this e-mail has been processed
set tags of theRecord to (tags of theRecord) & {replacedTagName}
end tell
end if
end if
end tell
end if
on error error_message number error_number
-- -128 is AppleScript's "user canceled" error; everything else is shown as a warning
if error_number is not -128 then display alert "Replace attachments" message error_message as warning
end try
-- remove the temporary record (runs whether or not the try block succeeded)
delete record rtfRecord
end if
end repeat
end repeat
end tell
-- Parse a JSON string with Foundation's NSJSONSerialization.
-- strJSON: text containing a JSON document.
-- Returns an AppleScript record when the top-level value is a JSON object,
-- otherwise the value coerced to a list.
-- Raises an AppleScript error with Foundation's description when parsing fails.
on fromJSON(strJSON)
    set jsonString to ca's NSString's stringWithString:strJSON
    set jsonData to jsonString's dataUsingEncoding:(ca's NSUTF8StringEncoding)
    set {parsedObject, parseError} to ca's NSJSONSerialization's JSONObjectWithData:jsonData options:0 |error|:(reference)
    -- Only the return value signals failure; the NSError out-parameter is not
    -- meaningful after a successful call, so it is consulted on failure only
    -- and always converted to text before being raised.
    if parsedObject is missing value then error (parseError's localizedDescription() as text)
    if parsedObject's isKindOfClass:(ca's NSDictionary) then
        return parsedObject as record
    else
        return parsedObject as list
    end if
end fromJSON
-- Serialize AppleScript data (lists, records, text, numbers) to a JSON string
-- with Foundation's NSJSONSerialization.
-- theData: the value to serialize.
-- Returns the JSON document as text.
-- Raises an AppleScript error with Foundation's description when serialization
-- fails, instead of failing later on the coercion of a missing value.
on toJSON(theData)
    set {theJSONData, theError} to ca's NSJSONSerialization's dataWithJSONObject:theData options:0 |error|:(reference)
    if theJSONData is missing value then error (theError's localizedDescription() as text)
    set JSONstr to (ca's NSString's alloc()'s initWithData:theJSONData encoding:(ca's NSUTF8StringEncoding)) as text
    return JSONstr
end toJSON
Python (`replace-attachments.py`):
#!/usr/bin/env python3
import argparse
import email
from email import policy
import uuid
import logging
import json
# Adapted from https://github.com/Conengmo/emailstripper/blob/master/emailstripper/run_remove_attachments.py
# File-name suffixes that mark an attachment as an image. Kept as a tuple so
# it can be passed straight to str.endswith().
IMAGE_EXTENSIONS = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.tiff',
    '.tif',
    '.bmp',
)

# Image attachments smaller than this many kB are left in the e-mail.
IMAGE_MIN_SIZE_KB = 150

# Attachments whose name ends with one of these suffixes are never extracted.
IGNORE_EXTENSIONS = (
    '.dat',
    '.rtf',
    '.ics',
)

# Attachments with exactly one of these names are never extracted.
IGNORE_ATTACHMENTS = (
    'winmail.dat',
    'application',
)
def walk_attachments(filename, replace_dict):
    """Find the extractable attachments of an .eml file and optionally strip them.

    Without replacements (``replace_dict`` empty) the names of the found
    attachments are printed to stdout as a JSON list; nothing is printed when
    none are found. With replacements, the found attachments are emptied, an
    inline HTML part linking to the replacements is appended and the file is
    rewritten in place - but only when the number of replacements equals the
    number of found attachments (the pairing itself is not checked).

    Args:
        filename: Path of the .eml file to analyse (and rewrite when replacing).
        replace_dict: Collection of ``[attachment name, link URL]`` pairs as
            decoded from the ``-r`` JSON argument; empty to only list the
            attachments.

    Returns:
        None. Problems are reported through ``logging``.
    """
    # Parse the message. The context manager closes the file again, and an
    # unreadable file ends the call instead of continuing without a reader.
    try:
        with open(filename, "rb") as reader:
            msg = email.message_from_binary_file(reader, policy=policy.default)
    except OSError as e:
        # logging interpolates %-style placeholders, not '{}'
        logging.error("Can't open file %s: %s", filename, e)
        return

    replace = len(replace_dict) > 0

    # find attachments and (when replacing) clear their content in `msg`
    found_list = walk_over_parts(msg, [], filename, replace)
    if not found_list:
        logging.info("No attachments found to replace")
        return
    logging.info('Found {} attachments to replace in {}'.format(len(found_list), filename))

    # if there are no replacements, only output found attachments
    if not replace:
        print(json.dumps(found_list))
        return

    # only replace if number of found attachments matches number of replacements
    if len(found_list) != len(replace_dict):
        logging.error("Number of found attachments does not match number of replacements")
        return

    # add the list of links to the original e-mail
    msg.add_attachment(get_replace_text(replace_dict), disposition='inline', subtype="html")

    # Serialize BEFORE opening the file for writing: opening truncates it, so
    # a serialization failure would otherwise destroy the original e-mail.
    # The message was parsed from bytes, so it is written back as bytes.
    try:
        serialized = msg.as_bytes()
    except UnicodeError as e:
        logging.error("Can't serialize %s: %s", filename, e)
        return
    with open(filename, 'wb') as writer:
        writer.write(serialized)
def walk_over_parts(parent, found_list, filename, replace = False):
    """Recursively collect (and optionally empty) the extractable attachments.

    Args:
        parent: The message (part) whose sub-parts are inspected.
        found_list: List the names of the found attachments are appended to.
        filename: Path of the .eml file being processed (currently unused).
        replace: When true, the content of every found attachment is replaced
            with an empty string.

    Returns:
        ``found_list``, extended with the names of the attachments found in
        ``parent`` and in all of its nested multipart parts.
    """
    # we're done if the parent is not a multi-part message
    if not parent.is_multipart():
        return found_list

    # iterate over all parts of the message
    for part in parent.get_payload():
        # skip plain or html content that isn't an attachment
        if part.get_content_type() in ["text/plain", "text/html"] and not part.is_attachment():
            continue

        # Recursively check multipart parts. `replace` must be passed on,
        # otherwise nested attachments are reported but never stripped.
        if part.is_multipart():
            found_list = walk_over_parts(part, found_list, filename, replace)
            continue

        # find size and name of attachment; (None, None) means "not relevant"
        content_size, attachment_name = parse_attachment(part)
        if content_size is None:
            continue
        # small images are left in the e-mail
        if attachment_name.endswith(IMAGE_EXTENSIONS) and content_size < (IMAGE_MIN_SIZE_KB * 1e3):
            continue
        if attachment_name.endswith(IGNORE_EXTENSIONS):
            continue

        if replace:
            logging.info('Removing attachment {} with size {:.0f} kB.'.format(attachment_name, content_size / 1e3))
            # clear the content of the attachment in place; `part` is the
            # very object stored in the parent's payload list
            part.set_content("")

        # append attachment to list of found items
        found_list.append(attachment_name)
    return found_list
def parse_attachment(part):
    """Return the estimated size and the name of a real attachment.

    Args:
        part: A non-multipart message part.

    Returns:
        A ``(content_size, attachment_name)`` tuple, where ``content_size`` is
        the size in bytes estimated from the (base64) payload length, or
        ``(None, None)`` when the part must not be considered: it is not an
        attachment, no name can be determined, its name is listed in
        ``IGNORE_ATTACHMENTS`` or its payload is not text.
    """
    # only get real attachments - add 'inline' if you also want inline attachments
    disposition = part.get_content_disposition()
    if disposition not in ['attachment']:
        return None, None

    # try to get attachment name, otherwise fall back to a generated name
    attachment_name = part.get_filename()
    if attachment_name is None:
        attachment_name = create_default_name(part)
        if attachment_name is None:
            return None, None

    # Do not consider inline images as relevant. This only matters once
    # 'inline' is accepted above; the disposition VALUE is compared here
    # (comparing the bound method itself is always False).
    if attachment_name.endswith(IMAGE_EXTENSIONS) and disposition == "inline":
        return None, None
    # skip IGNORE_ATTACHMENTS
    if attachment_name in IGNORE_ATTACHMENTS:
        return None, None

    # The size estimate needs the transfer-encoded text; skip anything else
    # explicitly (an assert would vanish under `python -O`).
    content = part.get_payload()
    if not isinstance(content, str):
        return None, None

    # calculate attachment size (to ignore too small attachments)
    # https://stackoverflow.com/questions/11761889/get-image-file-size-from-base64-string
    content_size = (len(content) * 3) / 4 - content.count('=', -2)
    return content_size, attachment_name
""" Create a default name for a part"""
def create_default_name(part):
    """Create a default name for a part that has no file name.

    The name has the form ``<disposition>-<random uuid>.<subtype>``, e.g.
    ``attachment-<uuid>.png`` for an ``image/png`` attachment.

    Args:
        part: The message part to name.

    Returns:
        The generated name, or None when the part has no Content-Type header.
    """
    # without a Content-Type header there is nothing to derive an extension from
    if part.get('Content-Type') is None:
        return None
    # The subtype is taken from the parsed header, so it is correct for every
    # main type (not only 'image/') and never contains header parameters.
    subtype = part.get_content_subtype()
    # fall back to 'attachment' when the part declares no disposition
    disposition = part.get_content_disposition() or 'attachment'
    return disposition + '-' + str(uuid.uuid4()) + '.' + subtype
""" Create HTML for replacement text"""
def get_replace_text(found_list):
    """Create the HTML part that lists links to the extracted attachments.

    Args:
        found_list: Sequence of ``(attachment name, link URL)`` pairs.

    Returns:
        An HTML document with one list item per pair, in reverse input order
        (the last pair is listed first). Names and URLs are HTML-escaped.
    """
    # local import so this function stays self-contained
    import html

    # Attachment names come from the e-mail itself, so they are escaped
    # before being placed in the markup (quotes included, as the URL sits in
    # a single-quoted attribute).
    list_items = [
        "\n\n<li><a href='{}?reveal=1'>{}</a></li>\r\n".format(
            html.escape(str(item[1])), html.escape(str(item[0])))
        for item in reversed(list(found_list))
    ]
    return "<html><body style='font-family: helvetica; font-size: large;'><br/><br/><hr><p><strong>Attachments:</strong><ul>{}</ul><p></body></html>".format("".join(list_items))
if __name__ == '__main__':
    # Command-line entry point.
    #   replace-attachments.py FILE.eml            -> print found attachments as JSON
    #   replace-attachments.py -r JSON FILE.eml    -> strip attachments, add links

    # set logging configuration
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

    # parse arguments
    parser = argparse.ArgumentParser(description='Replace attachments')
    parser.add_argument('filename', type=str, help='.eml file to parse')
    parser.add_argument('-r', dest='replace', help='replace found attachments with DEVONthink links')
    args = parser.parse_args()

    # only process .eml files (the extension is compared case-insensitively)
    replace_dict = {}
    if args.filename.lower().endswith('.eml'):
        # check if we need to replace (otherwise found attachments are just printed)
        if args.replace:
            try:
                replace_dict = json.loads(args.replace)
            except ValueError as e:
                # logging interpolates %-style placeholders, not '{}'
                logging.error("JSON error: %s", e)
                # Stop with a failure status: continuing would silently fall
                # back to list-only mode while the caller expects the
                # attachments to have been replaced.
                raise SystemExit(1)
        walk_attachments(args.filename, replace_dict)
    else:
        logging.error("Filename needs to end with .eml")