See here for motivation:
Rather than hijacking that thread, I’ll start a new one, suggesting a self-contained JXA script. It thus does not rely on Python or any other software being installed. The fairly long code follows at the end.
It relies on the approach of @mdbraber: have DT convert an e-mail to RTFD first. In that process, all attachments are saved as separate files in the RTFD folder. Those matching the criteria defined at the top of the script (size of images etc., also stolen from @mdbraber) are then removed from the original e-mail and replaced by a list of links to them at the end of the e-mail.
The original e-mail remains untouched. Instead, the attachments and a new version of the e-mail are created in a new group. This is a precaution, because the script is only minimally tested (given the myriad of different e-mail variants), and valuable data shouldn’t be destroyed.
To summarize:
- Starting with an e-mail “mail.eml” containing the attachments “img1.jpg”, “pdf.pdf” and “img2.png”
- The script creates a group “mail” containing said attachments and a record “mail - no Attachments.eml”. The latter contains links to all attachments in this group at the end. The attachments themselves have their “url” property set to the DT link of the fixed e-mail record. The new e-mail record is, of course, noticeably smaller than the original.
The script operates on the current selection of records and follows (mostly) the approach outlined by @mdbraber.
Please use with care and on copies first. It’s possible that the code will not work with every single e-mail out there in the wild.
(() => {
/* Global values used throughout the code */
/* Tag added to processed e-mail records whose attachments were extracted */
const replacedTagName = "attachments-extracted";
/* Tag added to processed e-mail records from which nothing was extracted */
const notReplacedTagName = "no-attachments-extracted";
/* Regular expression matching image filename extensions - incomplete!
   The "i" flag has to be handed to the RegExp constructor (join() silently
   ignores a second argument), otherwise upper-case extensions such as
   ".JPG" are not recognised as images. */
const imageExtensions = new RegExp(
  [".jpe?g", ".png", ".gif", ".tiff", ".tif", ".bmp"]
    .map((s) => `\\${s}$`)
    .join("|"),
  "i"
);
/* Size threshold for image attachments: an image whose file size is
   less than or equal to this value stays in the e-mail */
const imageMinSize = 10000;
/* Regular expression matching filename extensions of
   attachments to ignore (case-insensitive) */
const ignoreExtensions = new RegExp(
  [".dat", ".rtf", ".[iv]cs"].map((s) => `\\${s}$`).join("|"),
  "i"
);
/* Exact filenames of attachments to ignore */
const ignoreAttachments = ["winmail.dat", "application"];
/* Global handles shared by the code below:
- DTapp: the DEVONthink application object
- tmpFolder: POSIX path of the temporary items folder reported by
System Events; attachments are copied there before import into DT
- fileManager: the default NSFileManager object
- err: reference handed to the ObjC calls as their error argument */
const DTapp = Application("DEVONthink");
const tmpFolder =
Application("System Events").temporaryItemsFolder.posixPath();
const fileManager = $.NSFileManager.defaultManager;
const err = $();
/* Algorithm begins here */
/* From the list of selected records keep only e-mails (.eml files) that
   carry neither `replacedTagName` nor `notReplacedTagName`, i.e. that
   have not been handled by an earlier run */
const records = DTapp.selectedRecords().filter(
  (r) =>
    r.path().endsWith(".eml") &&
    !r.tags().includes(replacedTagName) &&
    !r.tags().includes(notReplacedTagName) &&
    (r.recordType() === "email" || r.recordType() === "unknown")
);
/* Loop over the e-mails */
records.forEach((r) => {
  const tags = r.tags();
  /* Convert e-mail to RTF. DT creates files
     for the raw e-mail text and all attachments in the process */
  const convertedRecord = DTapp.convert({ record: r, to: "rich" });
  /* Skip e-mails that are not RTFD, ie didn't contain attachments.
     The converted record is only a by-product of this script, so remove
     it here as well instead of leaving it behind in the database */
  if (convertedRecord.recordType() !== "RTFD") {
    DTapp.delete({ record: convertedRecord });
    return;
  }
  /* Get all files from the RTFD directory */
  const rtfdPath = convertedRecord.path();
  const attachmentNames = getAttachments(rtfdPath);
  /* Get list of attachments to replace */
  const validAttachments = buildAttachmentList(attachmentNames, rtfdPath);
  /* Skip this record if no attachments are to be replaced
     and set the "notReplaced" tag */
  if (validAttachments.length === 0) {
    r.tags = tags.concat(notReplacedTagName);
    DTapp.delete({ record: convertedRecord });
    return;
  }
  /* Maps attachment filename => DT item link of the imported attachment */
  const UUIDmap = {};
  const bareName = r.nameWithoutExtension();
  /* create target group for attachments */
  const targetGroup = DTapp.createRecordWith(
    {
      name: bareName,
      "record type": "group",
    },
    { in: r.locationGroup() }
  );
  copyDates(r, targetGroup);
  /* Now import each of the attachments into DT,
     set its dates to those of the original e-mail
     and remove the tmp copy of the attachment.
     Attachments are copied to a tmp folder first, because DT refuses
     to import its own files again */
  validAttachments.forEach((attachment) => {
    const tmpPath = copyToTmp(`${rtfdPath}/${attachment}`, tmpFolder);
    try {
      const importedAttachment = DTapp.importPath(tmpPath, {
        to: targetGroup,
      });
      /* Set properties of imported attachment */
      copyDates(r, importedAttachment);
      /* Save name/item link relationship to object */
      UUIDmap[attachment] = importedAttachment.referenceURL();
    } finally {
      /* remove temporary copy of attachment, even if the import failed */
      fileManager.removeItemAtPathError(tmpPath, err);
    }
  });
  /* Build the raw text of the e-mail without the extracted attachments
     and with a list of links to them appended. The original record's
     file is only read, not modified */
  const newRawMail = replaceAttachments(r, UUIDmap);
  /* Create a _new_ e-mail record containing the fixed original,
     in the same group as the attachments */
  const newMailRecord = DTapp.createRecordWith(
    {
      name: `${bareName} - no Attachments.eml`,
      "record type": "unknown",
      source: newRawMail,
      "creation date": r.creationDate(),
      "modification Date": r.modificationDate(),
    },
    { in: targetGroup }
  );
  const newURL = newMailRecord.referenceURL();
  const newUUID = newMailRecord.uuid();
  /* Set the attachments' `url` property to the DT item link
     of the fixed e-mail record */
  targetGroup
    .children()
    .filter((c) => c.uuid() !== newUUID)
    .forEach((c) => (c.url = newURL));
  /* Set the `replacedTagName` tag for the original e-mail
     and delete the RTFD record */
  r.tags = tags.concat(replacedTagName);
  DTapp.delete({ record: convertedRecord });
});
/* UTILITY FUNCTIONS follow */
/* Return an array with the names of all files in the RTFD directory.
   A file whose name ends in '.jpeg.jpg' (DT creates these for some
   reason) is renamed on disk so that the trailing '.jpg' is dropped, and
   the new name is returned in its place.
   path: path of the RTFD directory
   Throws an Error if such a rename fails.
*/
function getAttachments(path) {
  return fileManager
    .contentsOfDirectoryAtPathError($(path), err)
    .js.map((n) => {
      const filename = n.js;
      /* Only a doubled extension is fixed. Requiring the leading dot keeps
         names like 'holidayjpeg.jpg' from losing their sole extension */
      if (!filename.endsWith(".jpeg.jpg")) {
        return filename;
      }
      const newName = filename.replace(/\.jpg$/, "");
      const result = fileManager.moveItemAtPathToPathError(
        `${path}/${filename}`,
        `${path}/${newName}`,
        err
      );
      if (!result) {
        throw new Error(
          `Changing "${filename}" to "${newName}" failed: ${err.code} "${err.localizedDescription.js}"`
        );
      }
      return newName;
    });
}
/* Duplicate the file at `srcPath` inside the directory `tmpDir`, keeping
   its file name, and hand back the full path of the duplicate.
   Throws an Error when the file manager reports that the copy failed. */
function copyToTmp(srcPath, tmpDir) {
  /* Everything up to and including the last "/" is stripped,
     leaving the bare file name */
  const baseName = srcPath.replace(/^.*\//, "");
  const tmpPath = `${tmpDir}/${baseName}`;
  if (fileManager.copyItemAtPathToPathError(srcPath, tmpPath, err)) {
    return tmpPath;
  }
  throw new Error(`copying "${srcPath}" => "${tmpPath}" failed!
${err.localizedDescription.js}`);
}
/* Return the e-mail part (MIME headers, empty line, HTML body) holding
   the list of links to the replaced attachments.
   replacements: object mapping attachment filename => link to the
   imported attachment; "?reveal=1" is appended to every link.
   Filenames and links are HTML-escaped, because attachment names come
   from the e-mail and may contain characters like "<", "&" or quotes.
*/
function buildHTMLAttachment(replacements) {
  const escapeHTML = (text) =>
    String(text)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");
  const listItems = Object.entries(replacements)
    .map(
      ([filename, link]) =>
        `\n<li><a href="${escapeHTML(`${link}?reveal=1`)}">${escapeHTML(filename)}</a></li>\n`
    )
    .join("");
  return (
    /* The charset parameter is kept on the Content-Type line itself: a
       parameter on a line of its own is only a valid header continuation
       if that line starts with whitespace */
    `Content-Transfer-Encoding: 8bit\n` +
    `Content-Type: text/html; charset=utf-8\n\n` +
    `<html><body style="font-family: helvetica; font-size: large;"><br/><br/><hr><p><strong>Attachments:</strong></p><ul>` +
    listItems +
    `</ul></body></html>`
  );
}
/* Transfer the modification date and the creation date (in that order)
   from the record `from` to the record `to`. Each date is read by calling
   the accessor of that name on `from` and written by assigning to the
   property of the same name on `to`. */
function copyDates(from, to) {
  for (const dateProperty of ["modificationDate", "creationDate"]) {
    to[dateProperty] = from[dateProperty]();
  }
}
/* Select the attachments that are to be replaced by links.
   fileNames: names of all files in the RTFD folder
   rtfdPath: path to the RTFD folder
   Returns the names, in their original order, of the files that pass
   every check below.
*/
function buildAttachmentList(fileNames, rtfdPath) {
  const selected = [];
  for (const name of fileNames) {
    /* The RTF document itself is never an attachment */
    if (name === "TXT.rtf") {
      continue;
    }
    const attributes = fileManager.attributesOfItemAtPathError(
      $(`${rtfdPath}/${name}`),
      err
    );
    const size = attributes.fileSize;
    /* Empty files are dropped */
    if (size === 0) {
      continue;
    }
    /* Images not larger than imageMinSize are dropped */
    if (size <= imageMinSize && imageExtensions.test(name)) {
      continue;
    }
    /* Types (extensions) that don't need saving are dropped */
    if (ignoreExtensions.test(name)) {
      continue;
    }
    /* Names that don't need saving are dropped */
    if (ignoreAttachments.includes(name)) {
      continue;
    }
    selected.push(name);
  }
  return selected;
}
/* Build the raw text of an e-mail in which the matching attachments are
   removed and an HTML part with links to the exported attachments is
   appended in front of the last boundary.
   record: the e-mail record; its file is read as UTF-8, not modified
   replacements: object with filename => link pairs
   Returns the new raw e-mail text.
   Throws an Error if the file can't be read as UTF-8 text or if no MIME
   boundary can be found in it.
*/
function replaceAttachments(record, replacements) {
  /* Read the original e-mail */
  const mailPath = record.path();
  const content = $.NSString.stringWithContentsOfFileEncodingError(
    $(mailPath),
    $.NSUTF8StringEncoding,
    err
  ).js;
  if (typeof content !== "string") {
    throw new Error(`Reading "${mailPath}" as UTF-8 text failed`);
  }
  /* Boundary strings may contain characters such as + ( ) ? . which have
     a special meaning in a regular expression: escape them */
  const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  /* Find all distinct, non-empty 'boundary' strings first */
  const boundaries = [
    ...new Set(
      [...content.matchAll(/boundary="?(.*?)"?;?\n/gms)]
        .map((b) => b[1])
        .filter((b) => b.length > 0)
    ),
  ];
  if (boundaries.length === 0) {
    throw new Error(`No MIME boundary found in "${mailPath}"`);
  }
  /* Run all found boundaries together in a big alternation with leading
     '--'. The optional trailing '--' (closing delimiter) sits outside of
     the alternation so that it applies to every boundary, not only to
     the last one */
  const boundaryRE = new RegExp(
    `(--(?:${boundaries.map(escapeRegExp).join("|")})(?:--)?\n)`,
    "m"
  );
  /* Because of the capturing group, `parts` alternates between content
     (even indices) and boundary lines (odd indices) */
  const parts = content.split(boundaryRE);
  if (parts.length < 3) {
    throw new Error(`No MIME boundary line found in "${mailPath}"`);
  }
  /* Walk backwards over all parts of the e-mail.
     If the filename matches one of the keys in `replacements`,
     remove the attachment and its boundary from `parts`
  */
  let i = parts.length - 1;
  while (i > 1) {
    // Get the filename from this part.
    const filename = getFilenameFromPart(parts[i]);
    if (
      filename &&
      Object.prototype.hasOwnProperty.call(replacements, filename)
    ) {
      /* i is the index into `parts` with the matching filename
         i-1 is the boundary
         i is the attachment including the header
         Remove these two parts
      */
      parts.splice(i - 1, 2);
      i -= 2;
    } else {
      /* This part contains no matching part, keep it */
      i--;
    }
  }
  /* Append the list of links to attachments to the remaining
     parts of the e-mail: everything before the last boundary, then the
     last boundary without its closing '--' to open the new part, the new
     part itself and finally the last boundary as it was. Text after the
     last boundary is not carried over */
  const lastBoundary = parts[parts.length - 2];
  const txt =
    parts.slice(0, -2).join("") +
    lastBoundary.replace(/--\n/, "") +
    `\n` +
    /* buildHTMLAttachment returns list of links as HTML part */
    buildHTMLAttachment(replacements) +
    `\n\n${lastBoundary}`;
  return txt;
}
/* Passed in an e-mail part, function returns
   - the value of the first `filename=` or `filename*=` parameter found
     in the part's header (text before the first empty line)
   - undefined else.
   A value starting with "utf-8''" (in any letter case) is
   percent-decoded; if decoding fails, the value is returned undecoded.
*/
function getFilenameFromPart(part) {
  /* Split the current part at the first empty line. The first of these
     elements is the header of the part */
  const [rawHeader] = part.split(`\n\n`);
  /* Split the header into lines */
  const header = rawHeader.split(`\n`);
  /* If the header contains only one line, it's a boundary – return nothing */
  if (header.length === 1) {
    return undefined;
  }
  /* A quoted value ends at its closing quote, an unquoted one at the
     next ';' – further parameters on the same line (e.g. '; size=12')
     are not part of the filename */
  const filenameRE = /filename\*?=\s*(?:"([^"]*)"|([^;]*))/i;
  for (const line of header) {
    const match = line.match(filenameRE);
    /* Lines merely mentioning "filename" without such a parameter are skipped */
    if (!match) {
      continue;
    }
    const value =
      match[1] !== undefined
        ? match[1]
        : match[2].trim().replace(/^"|"$/g, "");
    if (!/^utf-8''/i.test(value)) {
      return value;
    }
    /* remove "utf-8''" from the filename and decode it */
    const encoded = value.replace(/^utf-8''/i, "");
    try {
      return decodeURIComponent(encoded);
    } catch (e) {
      /* Malformed percent-encoding: fall back to the raw value */
      return encoded;
    }
  }
  return undefined;
}
})()