Script: Extract PDF URLs

This handler extracts all URLs from the link annotations of a PDF and optionally filters them.

Filtering

  • Filter by URL start.
  • Filter by URL end.
  • Filter by URL start and URL end.
  • The filter can either "include" (keep only) or "exclude" (drop) the URLs that match the passed lists.

Handler

-- Script Library - Extract PDF URLs

use AppleScript version "2.4"
use framework "Foundation"
use framework "Quartz"
use scripting additions

-- Extracts the URLs of all link annotations of a PDF record.
--
-- Parameters:
--   theRecord              DEVONthink record; its type must be "PDF document".
--   theMode                "include", "exclude" or "" ("" disables filtering and ignores both lists).
--   theURLBeginsWith_list  List of URL prefixes to filter by ({} = no prefix condition).
--   theURLEndsWith_list    List of URL suffixes to filter by ({} = no suffix condition).
--
-- Filtering (case- and diacritic-insensitive):
--   include: keep URLs that begin with any of the prefixes AND end with any of the suffixes.
--   exclude: drop URLs that begin with any of the prefixes OR end with any of the suffixes.
--
-- Returns: an AppleScript list of URL strings, deduplicated and sorted with
--   localizedStandardCompare: ({} when the PDF contains no link URLs).
--
-- Errors: every failure is reported with exactly one alert and then raised as
--   error -128 ("User canceled"), so callers can ignore it silently.
on extractPDF_URLs(theRecord, theMode, theURLBeginsWith_list, theURLEndsWith_list)
	try
		tell application id "DNtp"
			try
				set theType to type of theRecord as string
				if theType is not in {"PDF document", "«constant ****pdf »"} then
					display alert "Error" message "Please make sure to select a PDF document" as warning giving up after 60
					-- Raise instead of a bare "return": a bare return yields no value and
					-- makes the caller's "set x to extractPDF_URLs(…)" fail with a second error.
					error number -128
				end if
				set thePath to path of theRecord
				
				if theMode = "include" then
					set isExclude to false
				else if theMode = "exclude" then
					set isExclude to true
				else if theMode = "" then
					-- No mode: behave as if no filter lists were passed
					set isExclude to false
					set theURLBeginsWith_list to {}
					set theURLEndsWith_list to {}
				else
					display alert "Error" message "Please make sure to use either \"include\" or \"exclude\"" as warning giving up after 60
					error number -128
				end if
				
			on error error_message number error_number
				if the error_number is not -128 then display alert "DEVONthink" message error_message as warning
				error number -128
			end try
		end tell
		
		set thePDF to current application's PDFDocument's alloc()'s initWithURL:(current application's |NSURL|'s fileURLWithPath:thePath)
		-- initWithURL: returns missing value when the file is missing or is not a readable PDF
		if thePDF is missing value then error "Could not open the PDF document at path: " & thePath
		
		set thePDF_PageCount to thePDF's pageCount()
		set thePDFAnnotationSubtypeLink to (current application's PDFAnnotationSubtypeLink)
		set thePDF_URLs_all to (current application's NSMutableArray's array())
		
		-- Collect the URL of every link annotation on every page
		repeat with i from 0 to (thePDF_PageCount - 1)
			set thePage to (thePDF's pageAtIndex:i)
			set thePage_Annotations to thePage's annotations()
			repeat with j from 0 to ((thePage_Annotations's |count|()) - 1)
				set thisAnnotation to (thePage_Annotations's objectAtIndex:j)
				if ((thisAnnotation's valueForAnnotationKey:(current application's PDFAnnotationKeySubtype))'s isEqualTo:thePDFAnnotationSubtypeLink) then
					-- Link annotations without a URL (e.g. internal page links) are skipped
					set thisURL to thisAnnotation's |URL|()
					if thisURL ≠ missing value then
						(thePDF_URLs_all's addObject:(thisURL's absoluteString()))
					end if
				end if
			end repeat
		end repeat
		
		-- Nothing found: return an empty list
		if thePDF_URLs_all's |count|() = 0 then return thePDF_URLs_all as list
		
		-- NSOrderedSet removes duplicates while keeping first-occurrence order
		set thePDF_URLs to (current application's NSOrderedSet's orderedSetWithArray:thePDF_URLs_all)'s array()
		
		if theURLBeginsWith_list ≠ {} or theURLEndsWith_list ≠ {} then
			-- Build one predicate group per non-empty list. Values are passed as predicate
			-- arguments (%@) rather than spliced into the format string, so quotes or
			-- backslashes inside a filter value cannot break the predicate.
			set theGroupPredicates to current application's NSMutableArray's array()
			repeat with thisPair in {{"BEGINSWITH", theURLBeginsWith_list}, {"ENDSWITH", theURLEndsWith_list}}
				set {theOperator, theAffix_list} to contents of thisPair
				if theAffix_list ≠ {} then
					set theFormat to "self " & theOperator & "[cd] %@"
					set theSubpredicates to (current application's NSMutableArray's array())
					repeat with thisAffix in theAffix_list
						(theSubpredicates's addObject:(current application's NSPredicate's predicateWithFormat:theFormat argumentArray:{thisAffix as text}))
					end repeat
					-- include: match any value of the list; exclude: match none of them
					set theGroupPredicate to (current application's NSCompoundPredicate's orPredicateWithSubpredicates:theSubpredicates)
					if isExclude then
						set theGroupPredicate to (current application's NSCompoundPredicate's notPredicateWithSubpredicate:theGroupPredicate)
					end if
					(theGroupPredicates's addObject:theGroupPredicate)
				end if
			end repeat
			
			-- Both groups (prefix and suffix) must hold
			set theFilterPredicate to current application's NSCompoundPredicate's andPredicateWithSubpredicates:theGroupPredicates
			set thePDF_URLs to thePDF_URLs's filteredArrayUsingPredicate:theFilterPredicate
		end if
		
		set thePDF_URLs to thePDF_URLs's sortedArrayUsingSelector:"localizedStandardCompare:"
		return thePDF_URLs as list
		
	on error error_message number error_number
		-- -128 means the problem was already reported (or the user cancelled): don't alert twice
		if error_number is not -128 then
			activate
			display alert "Error: Handler \"extractPDF_URLs\"" message error_message as warning
		end if
		error number -128
	end try
end extractPDF_URLs

Examples

Note: All examples use the handler as Script Library.

Include

Example - Extract all URLs

To extract all URLs without any filtering, pass empty lists.

Click
-- Example - Extract all URLs
-- Both filter lists are empty, so every URL found in the PDF is returned.

tell application id "DNtp"
	try
		set theSelection to selected records
		if (count of theSelection) is 0 then error "Please select a PDF record"
		set thisRecord to first item of theSelection
		
		-- No prefixes, no suffixes: nothing is filtered
		set {theURLBeginsWith_list, theURLEndsWith_list} to {{}, {}}
		set theMode to "include"
		
		set thePDF_URLs to script "Extract PDF URLs"'s extractPDF_URLs(thisRecord, theMode, theURLBeginsWith_list, theURLEndsWith_list)
		
	on error error_message number error_number
		-- -128 is "User canceled": stay silent for it
		if error_number ≠ -128 then
			display alert "DEVONthink" message error_message as warning
		end if
		return
	end try
end tell

Example - Extract URLs that start with “http://” or “https://”

Click
-- Example - Extract URLs that start with "http://" or "https://"
-- Only a prefix list is passed; the empty suffix list adds no condition.

tell application id "DNtp"
	try
		set theSelection to selected records
		if (count of theSelection) is 0 then error "Please select a PDF record"
		set thisRecord to first item of theSelection
		
		-- Keep only web links
		set theURLBeginsWith_list to {"http://", "https://"}
		set theURLEndsWith_list to {}
		set theMode to "include"
		
		set thePDF_URLs to script "Extract PDF URLs"'s extractPDF_URLs(thisRecord, theMode, theURLBeginsWith_list, theURLEndsWith_list)
		
	on error error_message number error_number
		-- -128 is "User canceled": stay silent for it
		if error_number ≠ -128 then
			display alert "DEVONthink" message error_message as warning
		end if
		return
	end try
end tell


Example - Extract URLs that end with “.pdf”

Click
-- Example - Extract URLs that end with ".pdf"
-- Only a suffix list is passed; the empty prefix list adds no condition.

tell application id "DNtp"
	try
		set theSelection to selected records
		if (count of theSelection) is 0 then error "Please select a PDF record"
		set thisRecord to first item of theSelection
		
		-- Keep only links that point to PDF files
		set theURLBeginsWith_list to {}
		set theURLEndsWith_list to {".pdf"}
		set theMode to "include"
		
		set thePDF_URLs to script "Extract PDF URLs"'s extractPDF_URLs(thisRecord, theMode, theURLBeginsWith_list, theURLEndsWith_list)
		
	on error error_message number error_number
		-- -128 is "User canceled": stay silent for it
		if error_number ≠ -128 then
			display alert "DEVONthink" message error_message as warning
		end if
		return
	end try
end tell


Exclude

Example - Extract URLs that don’t start with “mailto:” or “tel:” or “bitcoin:” or “file:” and don’t end with “.gif” or “.jpg” or “.jpeg” or “.png”

Click
-- Example - Extract URLs that don't start with "mailto:" or "tel:" or "bitcoin:" or "file:" and don't end with ".gif" or ".jpg" or ".jpeg" or ".png"
-- In "exclude" mode a URL is dropped as soon as it matches any prefix or any suffix.

tell application id "DNtp"
	try
		set theSelection to selected records
		if (count of theSelection) is 0 then error "Please select a PDF record"
		set thisRecord to first item of theSelection
		
		-- Schemes and image extensions to leave out
		set theURLBeginsWith_list to {"mailto:", "tel:", "bitcoin:", "file:"}
		set theURLEndsWith_list to {".gif", ".jpg", ".jpeg", ".png"}
		set theMode to "exclude"
		
		set thePDF_URLs to script "Extract PDF URLs"'s extractPDF_URLs(thisRecord, theMode, theURLBeginsWith_list, theURLEndsWith_list)
		
	on error error_message number error_number
		-- -128 is "User canceled": stay silent for it
		if error_number ≠ -128 then
			display alert "DEVONthink" message error_message as warning
		end if
		return
	end try
end tell


Setup

Usage as Script Library:

It is best used as Script Library.

  • In Finder:
    • press ⇧⌘G
    • paste ~/Library/Script Libraries
    • if no window is opened
      • paste ~/Library/
      • create folder Script Libraries
  • In Script Editor.app
    • create new document
    • paste the handler
    • press ⌘S
    • press ⇧⌘G
    • paste ~/Library/Script Libraries/Extract PDF URLs.scpt
    • save

To call the Script Library use:

-- thisRecord must already hold a DEVONthink PDF record (e.g. item 1 of selected records)
set theMode to "include"
set theURLBeginsWith_list to {}
set theURLEndsWith_list to {}

-- Call the handler through the Script Library named "Extract PDF URLs"
set thePDF_URLs to script "Extract PDF URLs"'s extractPDF_URLs(thisRecord, theMode, theURLBeginsWith_list, theURLEndsWith_list)

Usage inline:

If you want to use it inline make sure to add these statements at the top of your script:

use AppleScript version "2.4"
use framework "Foundation"
use framework "Quartz"
use scripting additions

To call the handler inline, prefix the call with "my":

-- thisRecord must already hold a DEVONthink PDF record (e.g. item 1 of selected records)
set theMode to "include"
set theURLBeginsWith_list to {}
set theURLEndsWith_list to {}

-- "my" makes the call target the handler defined in this script
set thePDF_URLs to my extractPDF_URLs(thisRecord, theMode, theURLBeginsWith_list, theURLEndsWith_list)

Thanks for the great script! By the way, via the user interface one option is to use Tools > Inspectors > Document > Links, select all links and copy them. Afterwards they can be e.g. pasted to bookmarks in Safari, the download manager of DEVONagent/think or a search set in DEVONagent.

1 Like