Splitting a PDF at a recurring string/expression

Some time ago, a user described how they were splitting a PDF manually at the string page 1 of x where x was the number of pages for each sub-document.

After some discussion, I arrived at the following JavaScript code that splits the currently selected PDF record into sub-documents. The sub-documents are saved as numbered files whose path and name prefix are defined by the variable baseName at the end of the code.

It relies on the regular expression 1\s+of\s+(\d+), i.e.

  • “1” followed by
  • at least one white space character
  • followed by “of”
  • followed by at least one white space character
  • and finally followed by one or more digits, which are collected in the first and only capturing group.

The content of the capturing group is used to determine the number of pages for each sub-document. Alternatively, one could modify the function splitAtRegEx so that it doesn’t rely on this capturing group but rather loops over the remaining pages in the original PDF to find the next occurrence of the regular expression.

The code has been tested. If it fails to split the PDF, you might try to re-OCR the PDF: just because you see “1 of 5” printed on a page doesn’t mean that this is what the current text layer contains. Even Apple’s “Pages” creates something like “5 1 of” in the text layer :frowning:

/*global ObjC, $, Application*/
ObjC.import('PDFKit');

/**
 * Copy a range of pages from PDFDoc into a new PDF saved as
 * "<baseName><fileNo>.pdf".
 *
 * The requested range is clamped to the pages that actually exist in PDFDoc,
 * so a range running past the end of the document (for example because the
 * text layer announced more pages than are present) is truncated instead of
 * requesting a non-existent page. If the clamped range is empty, no file is
 * written.
 *
 * @param {Number} firstPage - zero-based index of the first page to copy
 * @param {Number} lastPage - zero-based index of the last page to copy (inclusive)
 * @param {Number} fileNo - number appended to baseName to build the file name
 * @param {String} baseName - path and name prefix of the new file
 * @param {PDFDocument} PDFDoc - original PDF document
 * @returns {Boolean} true if a file was written successfully, false otherwise
 */
function writePDFPages(firstPage, lastPage, fileNo, baseName, PDFDoc) {
  /* Never ask the source document for a page outside 0 … pageCount - 1 */
  const startIndex = Math.max(firstPage, 0);
  const endIndex = Math.min(lastPage, PDFDoc.pageCount - 1);

  /* Also covers a NaN boundary: the comparison is false, so nothing is written */
  if (!(startIndex <= endIndex)) {
    return false;
  }

  const filename = $(`${baseName}${fileNo}.pdf`);
  const newPDF = $.PDFDocument.alloc.init;

  /* Append the selected pages to the new document in their original order */
  let targetPageNo = 0;
  for (let pageNo = startIndex; pageNo <= endIndex; pageNo++) {
    const page = PDFDoc.pageAtIndex(pageNo);
    newPDF.insertPageAtIndex(page, targetPageNo);
    targetPageNo++;
  }

  /* Hand the outcome of the write back to the caller instead of dropping it */
  return Boolean(newPDF.writeToFile(filename));
}

/**
 * Split the PDF at `path` into sub-documents. A sub-document starts on every
 * page whose text matches `pattern`; the first capturing group of the match
 * states how many pages belong to that sub-document. Pages that do not start
 * a sub-document are skipped.
 *
 * A page is also skipped when the captured value is not a positive integer
 * (e.g. a garbled text layer yielding "1 of 0", or a pattern without a
 * capturing group). An announced page count that exceeds the remaining pages
 * is truncated to the end of the document.
 *
 * @param {string} path - POSIX path to PDF file
 * @param {string} baseName - base name of files to create, will be numbered from 1 to n
 * @param {string} pattern - pattern to use. Its first capturing group should contain the number of pages for the subdocument.
 */
function splitAtRegEx(path, baseName, pattern) {
  const pageRE = new RegExp(pattern);
  const pathURL = $.NSURL.fileURLWithPath($(path));
  const PDFDoc = $.PDFDocument.alloc.initWithURL(pathURL);

  const numPages = PDFDoc.pageCount;
  let i = 0;
  let fileNo = 1;
  while (i < numPages) {
    const page = PDFDoc.pageAtIndex(i);

    /* A page without a text layer may not provide a JavaScript string;
       treat it as empty text so that match() cannot throw */
    const pageString = page.string;
    const text = (pageString && pageString.js) || '';

    const match = text.match(pageRE);
    const announced = match ? Number(match[1]) : NaN;

    /* A count below 1 would never advance i (endless loop) and NaN would
       silently end the loop, so such pages are skipped like non-matches */
    if (!Number.isInteger(announced) || announced < 1) {
      i++;
      continue;
    }

    /* Do not run past the last page of the original document */
    const pageCount = Math.min(announced, numPages - i);
    writePDFPages(i, i + pageCount - 1, fileNo, baseName, PDFDoc);
    i += pageCount;
    fileNo++;
  }
}

(() => {
  /* Change the baseName to your preferred path and name prefix. It is passed
     to splitAtRegEx as the base name of the numbered files to create. */
  const baseName = '/Users/YOU/Desktop/NewPDF-';

  /* For DT 4, replace "DEVONthink 3" with "DEVONthink" */
  const app = Application("DEVONthink 3");

  /* Fail with a clear message when nothing is selected instead of letting
     the path() call on a missing record fail with an opaque scripting error */
  const selection = app.selectedRecords();
  if (!selection || selection.length === 0) {
    throw new Error('Select a PDF record in DEVONthink before running this script.');
  }

  /* Only the first selected record is split */
  const path = selection[0].path();
  splitAtRegEx(path, baseName, '1\\s+of\\s+(\\d+)');
})();
1 Like

Thanks for sharing this script!

Thanks for sharing. This works for me and my test files! Thumbs :+1: