I’m working on expanding this script to be able to do something which has been asked in the forums sometimes: being able to replace attachments with DEVONthink links and index the attachments as ‘proper’ DT records. Wondering if there would be any tips or things to be aware of @cgrunenberg? I still have to test this on a larger archive, but so far it seems to be working.
I’m doing the following:
- Adding a function to the AppleScript above which writes the names + reference URLs of the attachments to the .eml file on disk in the form of a Finder comment (see below for the updated script)
- Running a Python script which goes through the .eml files on disk, strips the attachments and replaces them with the filenames + URLs found in the Finder comment of the .eml file
This is the Python script used (be sure to install xattr and bpylist):
import email.mime.text
from email import message_from_file
import os
import re
import uuid
import xattr
from bpylist import bplist
def main(path, filename=None):
    """Strip attachments from one or all ``.eml`` files in *path*.

    Args:
        path: Directory that contains the ``.eml`` files.
        filename: Optional name of a single file inside *path*. When it is
            ``None`` every entry returned by ``os.listdir(path)`` is examined.

    Entries whose name does not end in ``.eml`` are skipped. For every message
    in which at least one attachment was replaced, the rewritten message and a
    one-line summary are printed; this function does not write anything back
    to disk.
    """
    names = [filename] if filename is not None else os.listdir(path)
    for name in names:
        if not name.endswith('.eml'):
            continue
        # Parse inside a context manager so the file handle is always closed,
        # even when parsing raises.
        with open(os.path.join(path, name)) as f:
            msg = message_from_file(f)
        # The counter is per file, so it always starts at zero.
        count = walk_over_parts(msg, 0, path, name)
        if count > 0:
            print(msg)
            print('Removed {} attachments from {}.'.format(count, name))
def _read_finder_comment(path, filename):
    """Return the Finder comment stored on ``path/filename``, or ``None``.

    The comment is read from the ``com.apple.metadata:kMDItemFinderComment``
    extended attribute, decoded as a binary property list, and stripped of
    trailing ``|`` separators. ``None`` is returned when the attribute cannot
    be read (for example because the file has no Finder comment).
    """
    try:
        raw = xattr.getxattr(os.path.join(path, filename),
                             'com.apple.metadata:kMDItemFinderComment')
    except OSError:
        return None
    return bplist.parse(raw).rstrip("|")


def walk_over_parts(parent, count, path, filename):
    """Walk over the parts of *parent* and replace attachments with links.

    The function is recursive: *parent* is a message, a part of a message, a
    sub-part of a part, and so on.

    Args:
        parent: The message (part) whose direct children are inspected.
        count: Number of attachments replaced so far.
        path: Directory of the ``.eml`` file being processed.
        filename: Name of the ``.eml`` file; its Finder comment supplies the
            attachment names and links.

    Returns:
        The updated number of replaced attachments.

    ``text/plain`` and ``text/html`` parts are never touched. A part is
    replaced only when ``parse_attachment`` reports a size above 1e3. Each
    replaced part receives the full list built from the Finder comment. When
    the file has no (or an empty) Finder comment the attachments are left in
    place, because there would be nothing to link to.
    """
    if not parent.is_multipart():
        return count
    payload = parent.get_payload()
    comment = None  # read lazily, at most once per call
    for i, part in enumerate(payload):
        if part.get_content_type() in ("text/plain", "text/html"):
            continue
        if part.is_multipart():
            count = walk_over_parts(part, count, path, filename)
            continue
        content_size, attachment_name = parse_attachment(part)
        if content_size is None or content_size <= 1e3:
            continue
        if comment is None:
            comment = _read_finder_comment(path, filename)
            if not comment:
                # Without link information the attachment must be kept.
                print('No Finder comment on {}; keeping attachments.'.format(filename))
                return count
        print('Removing attachment {} with size {:.0f} kB.'.format(attachment_name, content_size / 1e3))
        # Replacing the element at the current index is safe while iterating.
        payload[i] = get_replace_text(comment)
        parent.set_payload(payload)
        count += 1
    return count
def parse_attachment(part):
    """Return ``(content_size, attachment_name)`` if *part* is an attachment.

    Args:
        part: A non-multipart message part.

    Returns:
        A tuple ``(content_size, attachment_name)``. ``content_size`` is the
        length of the part's payload string as stored in the message (the
        payload is not decoded), and ``attachment_name`` is the part's file
        name or a generated default. ``(None, None)`` is returned when the
        part's content disposition is neither ``inline`` nor ``attachment``,
        when no name can be determined, or when the payload is not a string.
    """
    if part.get_content_disposition() not in ('inline', 'attachment'):
        return None, None
    attachment_name = part.get_filename()
    if attachment_name is None:
        attachment_name = create_default_name(part)
    if attachment_name is None:
        return None, None
    content = part.get_payload()
    # An explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, and a non-string payload should simply be skipped.
    if not isinstance(content, str):
        return None, None
    return len(content), attachment_name
def create_default_name(part):
    """Build a placeholder file name for an attachment that has none.

    Args:
        part: The message part to name.

    Returns:
        A string ``<disposition>-<uuid4>.<subtype>``, for example
        ``inline-<uuid>.png`` for an ``image/png`` part, or ``None`` when the
        part has no ``Content-Type`` header.
    """
    if part.get('Content-Type') is None:
        return None
    # Use the parsed subtype instead of slicing the raw header value: the
    # slice only worked for "image/..." and kept header parameters, so
    # "application/pdf" produced a name containing a slash.
    disposition = part.get_content_disposition() or 'attachment'
    return disposition + '-' + str(uuid.uuid4()) + '.' + part.get_content_subtype()
def get_replace_text(comment):
    """Return an HTML message part that lists the attachments as links.

    Args:
        comment: String of ``filename;link`` entries separated by ``|``.

    Returns:
        An ``email.mime.text.MIMEText`` object of subtype ``html`` containing
        one list item per entry. Items appear in the reverse of their order in
        *comment*. Entries without a ``;`` separator or without a link are
        ignored, so an empty comment yields an empty list instead of raising.
    """
    from html import escape  # local import keeps this block self-contained

    items = []
    for entry in comment.split("|"):
        # Split on the last ";" so a file name that itself contains ";"
        # stays intact.
        filename, sep, link = entry.rpartition(";")
        if not sep or not link:
            continue
        # Escape both values: they end up inside HTML markup and attributes.
        items.append("\n\n<li><a href='{}'>{}</a></li>\r\n".format(
            escape(link, quote=True), escape(filename, quote=True)))
    items.reverse()
    return email.mime.text.MIMEText(
        "<br/><br/><hr><br/><strong>Attachments:</strong><ul>{}</ul>".format("".join(items)),
        'html')
if __name__ == '__main__':
    # Directory holding the .eml files to process; adjust before running.
    main(path='/Users/yourname/E-mail/archive')
And this is the updated AppleScript:
-- Import attachments of selected emails
--
-- For every selected record that is an .eml file, the message is converted to
-- rich text. The files found inside the resulting RTFD package (except .rtf
-- and .png files) are imported into a new group named after the email, the
-- email itself is moved into that group, and a Finder comment of the form
-- "filename;reference URL|filename;reference URL|..." is written to the .eml
-- file on disk.
property currentCount : 0

tell application id "DNtp"
    set theSelection to the selection
    set tmpFolder to path to temporary items
    set tmpPath to POSIX path of tmpFolder
    with timeout of 14400 seconds
        repeat with theRecord in theSelection
            set currentCount to currentCount + 1
            if type of theRecord is unknown and path of theRecord ends with ".eml" then
                set theRTF to convert record theRecord to rich
                set theURL to reference URL of theRecord
                set theSender to URL of theRecord
                set theGroup to parent 1 of theRecord
                set theName to name of theRecord
                set theModificationDate to the modification date of theRecord
                set theCreationDate to the creation date of theRecord
                set theAdditionDate to the addition date of theRecord
                set commentString to ""
                -- newGroup stays false until the first attachment is imported
                set newGroup to false
                -- Coerce the counter to text first; concatenating an integer
                -- with text would otherwise build a list instead of a string
                set logString to (currentCount as string) & ": " & theName & " (" & theURL & ")"
                log logString
                try
                    if type of theRTF is rtfd then
                        set thePath to path of theRTF
                        tell text of theRTF
                            if exists attachment in attribute runs then
                                tell application "Finder"
                                    set filelist to every file in ((POSIX file thePath) as alias)
                                    repeat with theFile in filelist
                                        set theAttachment to POSIX path of (theFile as string)
                                        if theAttachment does not end with ".rtf" and theAttachment does not end with ".png" then
                                            try
                                                with timeout of 7200 seconds
                                                    -- Importing skips files inside the database package,
                                                    -- therefore let's move them to a temporary folder first
                                                    set theAttachment to move ((POSIX file theAttachment) as alias) to tmpFolder with replacing
                                                    set theAttachment to POSIX path of (theAttachment as string)
                                                    tell application id "DNtp"
                                                        -- Create the target group lazily, with the email's dates
                                                        if newGroup is false then
                                                            set newGroup to create record with {name:theName, type:group, modification date:theModificationDate, creation date:theCreationDate, addition date:theAdditionDate} in theGroup
                                                        end if
                                                        set importedFile to import theAttachment to newGroup
                                                        -- Point the imported attachment back at the email
                                                        set URL of importedFile to theURL
                                                        set the modification date of importedFile to theModificationDate
                                                        set the creation date of importedFile to theCreationDate
                                                        --set importedPath to path of importedFile
                                                        --tell application "Finder"
                                                        --	set comment of ((POSIX file importedPath) as alias) to theURL
                                                        --end tell
                                                        -- Prepend "filename;reference URL|" for this attachment
                                                        set commentString to ((filename of importedFile) as string) & ";" & ((reference URL of importedFile) as string) & "|" & commentString
                                                        log commentString
                                                    end tell
                                                end timeout
                                            end try
                                        end if
                                    end repeat
                                end tell
                            end if
                        end tell
                        if newGroup is not false then
                            tell application id "DNtp"
                                move record theRecord to newGroup
                                -- Read the path after the move, then store the
                                -- attachment list as the file's Finder comment
                                set recordPath to path of theRecord
                                if commentString is not equal to "" then
                                    tell application "Finder"
                                        set comment of ((POSIX file recordPath) as alias) to commentString
                                    end tell
                                end if
                            end tell
                        end if
                    end if
                on error msg
                    display dialog msg
                end try
                -- Remove the temporary rich text conversion. This must stay
                -- inside the .eml check: for any other record theRTF was
                -- never created in this iteration.
                delete record theRTF
            end if
        end repeat
    end timeout
end tell