Separate imported e-mail attachments for better search

I’m working on expanding this script to be able to do something which has been asked in the forums sometimes: being able to replace attachments with DEVONthink links and index the attachments as ‘proper’ DT records. Wondering if there would be any tips or things to be aware of @cgrunenberg? I still have to test this on a larger archive, but so far it seems to be working.

I’m doing the following:

  • Adding a function to the AppleScript above which writes the names + reference URLs of the attachment to the .eml file on disk in the form of a Finder comment (see below for the updated script)
  • Running a Python script which goes through the .eml files on disk, strips the attachments and replaces them with the filenames + URLs found in the Finder comment of the .eml file

This is the Python script used (be sure to install the xattr and bpylist packages first):

import email.mime.text
from email import message_from_file
import os
import re
import uuid
import xattr
from bpylist import bplist

def main(path, filename=None):
    """Strip attachments (in memory) from one or all .eml files in a directory.

    Args:
        path: Directory that contains the .eml files.
        filename: Optional name of a single file inside ``path``. When omitted,
            every entry of ``path`` is considered.

    For every file whose name ends with ``.eml`` the message is parsed and
    handed to ``walk_over_parts``. If at least one attachment was replaced the
    rewritten message is printed, followed by a one-line summary. Nothing is
    written back to disk by this function.
    """
    names = [filename] if filename is not None else os.listdir(path)
    for name in names:
        if not name.endswith('.eml'):
            continue
        # The parser reads the whole file up front, so the handle can be
        # closed before the message is processed.
        with open(os.path.join(path, name)) as f:
            msg = message_from_file(f)
        count = walk_over_parts(msg, 0, path, name)
        if count > 0:
            print(msg)
        print('Removed {} attachments from {}.'.format(count, name))


def walk_over_parts(parent, count, path, filename):
    """Recursively replace large attachments below ``parent`` with a link list.

    ``parent`` may be a whole message or any (sub)part of one. Parts of type
    text/plain and text/html are left alone, nested multipart containers are
    descended into, and every remaining part that ``parse_attachment``
    reports as larger than 1e3 characters is swapped for the part built by
    ``get_replace_text`` from the Finder comment of the .eml file at
    ``path``/``filename``.

    Returns ``count`` increased by the number of parts that were replaced.
    """
    if parent.is_multipart():
        for index, child in enumerate(parent.get_payload()):
            # Body text is never treated as an attachment.
            if child.get_content_type() in ("text/plain", "text/html"):
                continue
            if child.is_multipart():
                count = walk_over_parts(child, count, path, filename)
                continue
            size, name = parse_attachment(child)
            if size is None or not size > 1e3:
                continue
            print('Removing attachment {} with size {:.0f} kB.'.format(name, size / 1e3))
            siblings = parent.get_payload()
            # The Finder comment is stored as a binary plist in an extended
            # attribute of the .eml file.
            raw_comment = xattr.getxattr(
                os.path.join(path, filename),
                'com.apple.metadata:kMDItemFinderComment',
            )
            comment = bplist.parse(raw_comment).rstrip("|")
            siblings[index] = get_replace_text(comment)
            parent.set_payload(siblings)
            count += 1
    return count


def parse_attachment(part):
    """Inspect a message part and report its size and name if it is an attachment.

    Returns a ``(content_size, attachment_name)`` tuple. Both values are None
    when the part's content disposition is neither ``inline`` nor
    ``attachment``, or when no name can be determined. The size is the length
    of the payload string as stored in the message (i.e. still encoded).
    """
    disposition = part.get_content_disposition()
    if disposition not in ('inline', 'attachment'):
        return None, None

    name = part.get_filename()
    if name is None:
        # Unnamed part: fall back to a generated name.
        name = create_default_name(part)
        if name is None:
            return None, None

    body = part.get_payload()
    assert type(body) is str
    return len(body), name


def create_default_name(part):
    """Build a fallback file name for an attachment part that has none.

    The name has the form ``<disposition>-<uuid4>.<subtype>``, for example
    ``inline-<uuid>.png`` for an inline ``image/png`` part. The extension is
    the MIME subtype, so it is correct for any main type (not only
    ``image/``) and never contains header parameters such as ``; name=...``.

    Returns None when the part carries no Content-Type header.
    """
    # Header lookup through the public API is case-insensitive.
    if part.get('Content-Type') is None:
        return None
    # Fall back to 'attachment' so a part without a disposition does not
    # raise a TypeError on string concatenation.
    disposition = part.get_content_disposition() or 'attachment'
    extension = part.get_content_subtype()
    return '{}-{}.{}'.format(disposition, uuid.uuid4(), extension)


def get_replace_text(comment):
    """Return an HTML message part that lists links to the stored attachments.

    Args:
        comment: Finder comment text made of ``name;url`` entries separated
            by ``|``.

    Returns:
        An ``email.mime.text.MIMEText`` part of subtype ``html`` containing
        one list item per well-formed entry. Entries appear in reverse order
        of the comment. Empty entries and entries without a ``;`` separator
        are skipped, so an empty comment yields an empty list instead of
        raising IndexError.
    """
    from html import escape

    items = []
    for entry in comment.split("|"):
        # Split on the last ';' so that a file name containing ';' stays intact.
        name, separator, link = entry.rpartition(";")
        if not separator:
            continue
        items.append(
            "\n\n<li><a href='{}'>{}</a></li>\r\n".format(
                escape(link, quote=True), escape(name)
            )
        )
    # Reversed: the comment is assumed to list the newest attachment first
    # (the writer is not part of this function - TODO confirm).
    replace_text = "".join(reversed(items))
    return email.mime.text.MIMEText(
        "<br/><br/><hr><br/><strong>Attachments:</strong><ul>{}</ul>".format(replace_text),
        'html',
    )


if __name__ == '__main__':
    # Adjust this to the folder that holds the exported .eml files.
    main(path='/Users/yourname/E-mail/archive')

And this is the updated AppleScript:

-- Import attachments of selected emails
--
-- For every selected .eml record the message is converted to rich text, each
-- file in the resulting RTFD package (except .rtf and .png files) is moved to a
-- temporary folder and imported into a new group named after the e-mail, and
-- the e-mail itself is moved into that group. The file names and reference
-- URLs of the imported attachments are stored as "name;url|name;url|..." in
-- the Finder comment of the .eml file.
property currentCount : 0

tell application id "DNtp"
	set theSelection to the selection
	set tmpFolder to path to temporary items
	set tmpPath to POSIX path of tmpFolder
	
	with timeout of 14400 seconds
		repeat with theRecord in theSelection
			set currentCount to currentCount + 1
			if type of theRecord is unknown and path of theRecord ends with ".eml" then
				set theRTF to convert record theRecord to rich
				set theURL to reference URL of theRecord
				set theSender to URL of theRecord
				set theGroup to parent 1 of theRecord
				set theName to name of theRecord
				set theModificationDate to the modification date of theRecord
				set theCreationDate to the creation date of theRecord
				set theAdditionDate to the addition date of theRecord
				set commentString to ""
				-- newGroup stays false until the first attachment is imported
				set newGroup to false
				
				-- Coerce the counter first; concatenating onto an integer would build a list
				set logString to (currentCount as string) & ": " & theName & " (" & theURL & ")"
				log logString
				
				try
					if type of theRTF is rtfd then
						set thePath to path of theRTF
						tell text of theRTF
							if exists attachment in attribute runs then
								tell application "Finder"
									set filelist to every file in ((POSIX file thePath) as alias)
									repeat with theFile in filelist
										set theAttachment to POSIX path of (theFile as string)
										
										if theAttachment does not end with ".rtf" and theAttachment does not end with ".png" then
											try
												with timeout of 7200 seconds
													
													-- Importing skips files inside the database package,
													-- therefore let's move them to a temporary folder first
													
													set theAttachment to move ((POSIX file theAttachment) as alias) to tmpFolder with replacing
													set theAttachment to POSIX path of (theAttachment as string)
													tell application id "DNtp"
														-- Create the target group lazily, with the e-mail's name and dates
														if newGroup is false then
															set newGroup to create record with {name:theName, type:group, modification date:theModificationDate, creation date:theCreationDate, addition date:theAdditionDate} in theGroup
														end if
														
														set importedFile to import theAttachment to newGroup
														set URL of importedFile to theURL
														set the modification date of importedFile to theModificationDate
														set the creation date of importedFile to theCreationDate
														
														--set importedPath to path of importedFile
														--tell application "Finder"
														--	set comment of ((POSIX file importedPath) as alias) to theURL
														--end tell
														
														-- Prepend "filename;reference URL|" for this attachment
														set commentString to ((filename of importedFile) as string) & ";" & ((reference URL of importedFile) as string) & "|" & commentString
														log commentString
													end tell
												end timeout
											end try
										end if
									end repeat
								end tell
							end if
						end tell
						if newGroup is not false then
							tell application id "DNtp"
								move record theRecord to newGroup
								set recordPath to path of theRecord
								if commentString is not equal to "" then
									-- Store the attachment list as Finder comment of the .eml file
									tell application "Finder"
										set comment of ((POSIX file recordPath) as alias) to commentString
									end tell
								end if
							end tell
						end if
						
					end if
				on error msg
					display dialog msg
				end try
				-- Remove the temporary rich text conversion. This must stay inside the
				-- "if" above: theRTF is only set for .eml records.
				delete record theRTF
			end if
		end repeat
	end timeout
end tell
2 Likes