Script to look up PDF document metadata on crossref.org

mnot · September 21, 2021, 5:35am

I hate AppleScript. Really hate it. But I love Devonthink, and had an itch to scratch. This is somewhat similar to other scripts that do bibliographic lookups, but it uses crossref.org.

gist.github.com

https://gist.github.com/mnot/0d7825bde9b9d3233f623c71765f20ca

Crossref Lookup.scpt


-- Look up document metadata on CrossRef.org in DevonThink 3
-- 
-- Currently sets:
-- * Created and Modified dates to the document's publication date
-- * Title in document properties
-- * Author(s) in document properties
-- * A tag for the type of document
-- 
--

This file has been truncated. show original

Feedback, etc. welcome.

cgrunenberg · September 21, 2021, 6:19am

Looks like an interesting script. However, as it requires either a text selection or a certain name I wonder what kind of input crossref.org accepts/expects.

By the way, do your documents contain a DOI (digital object identifier)? Then the smart rule script Download Bibliographic Metadata might be useful too.

cgrunenberg · September 21, 2021, 6:22am

Okay, I should obviously read the comments how to use it first

mnot · September 21, 2021, 6:56am

It uses crossref’s query.bibliographic, which allows pretty broad input; authors, ISBNs, titles, etc.

My workflow when I download papers is usually to set the name of the paper using control-command-I (awesome feature, was so happy when I found that); that’s why it takes the name if you don’t select any text.

Some papers contain DOIs, but many don’t. I might make it a bit more nuanced if there’s existing metadata…

BLUEFROG · September 22, 2021, 3:43pm

For hating AppleScript, it looks like you made something useful… and I hope that itch is gone now

extracampine · December 20, 2021, 11:04pm

Hi, I followed your instructions (I think!), but when I select the script in DT, nothing happens?

chrillek · December 21, 2021, 11:27am

Since I’m not particularly fond of AS either, I took the liberty to rewrite the script in JavaScript. It seems to work (tested it with one record only, though). Also, I stumbled upon results from the API call without a title field; the script then fills in a stupid placeholder. Don’t know if that is reasonable or not.
Here goes

/**
 * JXA entry point: look up the document currently open in DEVONthink 3 on
 * crossref.org and copy the chosen result's metadata onto the record.
 *
 * The query is the text selected in the frontmost window or, when nothing is
 * selected, the record's name. On success the script sets the record's date,
 * adds the result's `type` as a tag, and writes Title/Author into the PDF via
 * setPDFMetadata().
 *
 * Throws an Error when no window/document is available or when the API does
 * not answer with status "ok".
 */
(() => {
  const app = Application("DEVONthink 3");
  /* currentApplication() with standard additions provides dialogs and doShellScript */
  const curApp = Application.currentApplication();
  curApp.includeStandardAdditions = true;

  /* Basic error checking */
  const thinkWindow = app.thinkWindows();
  const contentRec = app.contentRecord();
  if (!thinkWindow || !thinkWindow[0]) throw new Error("No window open");
  if (!contentRec) throw new Error("No document selected");

  /* Single-quote a value for the shell; an embedded ' becomes '\'' so that
     titles such as "Darwin's ..." cannot break (or inject into) the command */
  const shellQuote = (text) => `'${String(text).replace(/'/g, "'\\''")}'`;

  /* Query for either selected text or the name of the record if no text is selected */
  const query = thinkWindow[0].selectedText() || contentRec.name();
  const apiURL = 'https://api.crossref.org/works';
  const shellCmd = `curl -A "(https://gist.github.com/mnot/0d7825bde9b9d3233f623c71765f20ca)" -G ${apiURL} --data-urlencode query.bibliographic=${shellQuote(query)} -d rows=5 -d select=author,title,created,type,publisher,published,subject`;
  const apiResult = JSON.parse(curApp.doShellScript(shellCmd));

  /* Basic error checking for the return value of the API call.
     `message` is stringified because it is not necessarily a plain string. */
  if (apiResult.status !== "ok") {
    throw new Error(`API response not OK: ${JSON.stringify(apiResult.message)}`);
  }
  const itemList = (apiResult.message && apiResult.message.items) || [];
  if (itemList.length === 0) {
    curApp.displayAlert("No matches found!");
    return;
  }

  /* Arrays to save the dates and authors, no need to extract them twice */
  const choices = [];
  const dates = [];
  const authors = [];

  /* Build list of choices from results */
  itemList.forEach((item) => {
    /* Some results come back without a title; show a placeholder for those */
    const title = item.title && item.title[0] ? item.title[0] : "No Title?";
    const authorString = extractAuthor(item);
    authors.push(authorString);
    const dateString = extractDate(item);
    dates.push(dateString);
    const detailString = `${authorString}${authorString ? ', ' : ''}${dateString}`;
    choices.push(`${title} (${detailString})`);
  });
  const selection = curApp.chooseFromList(choices, {withPrompt: "Select:"});
  if (!selection) return; /* dialog cancelled */

  /* User selected from the list of references: get the selected index */
  const selectedIndex = choices.indexOf(selection[0]);

  /* get the corresponding item from the API result */
  const selectedItem = itemList[selectedIndex];

  /* Set the record's date to the date of the selected item.
     extractDate() yields "year-month-day"; months are 0-indexed in JS! */
  const [year, month, day] = dates[selectedIndex].split('-').map(Number);
  contentRec.date = new Date(year, month - 1, day);

  /* add the 'type' field from the result to the record's tags */
  const typeTag = selectedItem.type;
  const recordTags = contentRec.tags();
  if (typeTag && !recordTags.includes(typeTag)) {
    recordTags.push(typeTag);
    contentRec.tags = recordTags;
  }

  /* Set PDF metadata fields Title and Author. Missing values are skipped:
     the placeholder title is never written, and an empty author is not
     written so that existing Author metadata is left untouched. */
  const recordPath = contentRec.path();
  const selectedTitle = selectedItem.title && selectedItem.title[0];
  if (selectedTitle) {
    setPDFMetadata(curApp, "Title", selectedTitle, recordPath);
  }
  if (authors[selectedIndex]) {
    setPDFMetadata(curApp, "Author", authors[selectedIndex], recordPath);
  }
})();


/**
 * Write a single metadata tag into a PDF by running exiftool through the shell.
 *
 * @param {object} curApp - application object exposing doShellScript()
 * @param {string} key - exiftool tag name, e.g. "Title" or "Author"
 * @param {string} value - value to store in the tag
 * @param {string} path - file system path of the PDF to modify
 * @param {string} [pathToExiftool] - location of the exiftool binary; defaults
 *   to the path used so far, pass e.g. '/opt/homebrew/bin/exiftool' if it is
 *   installed elsewhere
 */
function setPDFMetadata(curApp, key, value, path, pathToExiftool = '/usr/local/bin/exiftool') {
  /* Single-quote for the shell; an embedded ' is written as '\'' so values
     and paths containing apostrophes cannot terminate the quoting early */
  const shellQuote = (text) => `'${String(text).replace(/'/g, "'\\''")}'`;
  const shellCommand = `${shellQuote(pathToExiftool)} -overwrite_original -${key}=${shellQuote(value)} ${shellQuote(path)}`;
  curApp.doShellScript(shellCommand);
}

/**
 * Extract a display name for the first author of one item of the API result.
 *
 * The entry whose `sequence` is "first" is preferred; without one, the first
 * array element is used. The name is "given family" when available, otherwise
 * the entry's `name` field (used when neither given nor family is present).
 *
 * @param {object} item - one element of the API result's item list
 * @returns {string} the author name, or "" when none can be determined
 */
function extractAuthor(item) {
  const authorList = item.author;
  /* A missing or empty author list yields "" instead of a crash */
  if (!Array.isArray(authorList) || authorList.length === 0) return "";
  const firstAuthor = authorList.find((entry) => entry && entry.sequence === "first") || authorList[0];
  if (!firstAuthor) return "";
  const personalName = `${firstAuthor.given || ""} ${firstAuthor.family || ""}`.trim();
  return personalName || String(firstAuthor.name || "").trim();
}

/**
 * Build a "year-month-day" string for one item of the API result.
 *
 * The first entry of `published['date-parts']` is used when it provides a
 * year, a month and a day; in every other case the first entry of
 * `created['date-parts']` is used instead.
 *
 * @param {object} item - one element of the API result's item list
 * @returns {string} the date as `${year}-${month}-${day}` (no zero padding)
 */
function extractDate(item) {
  const isComplete = (parts) => Boolean(parts[0] && parts[1] && parts[2]);

  let dateParts = [];
  const published = item.published;
  if (published && published['date-parts']) {
    dateParts = published['date-parts'][0];
  }
  /* Incomplete or missing publication date: fall back to the 'created' field */
  if (!isComplete(dateParts)) {
    dateParts = item.created['date-parts'][0];
  }

  const [y, m, d] = dateParts;
  return `${y}-${m}-${d}`;
}

AW2307 · May 4, 2022, 9:37am

Thanks for sharing the script @mnot.

I’m encountering the same issue as @extracampine after following the installation instructions.

Has anyone found the cause or, even better, a way to fix this?

chrillek · May 4, 2022, 9:58am

Is anything reported in DT’s log window?

AW2307 · May 4, 2022, 10:36am

Nothing is logged and there is no visible sign that anything is happening otherwise.

chrillek · May 4, 2022, 12:18pm

You could run the script from Script Editor and open its message area. Then you’ll see all Apple events happening (or not), which might give some ideas as to what’s going on (or not).

AW2307 · May 4, 2022, 12:35pm

That was a great hint, thanks!
The script shows the selection dialogue as expected if I run it through script editor. If I instead run the same script by clicking on the entry in DT’s script menu, nothing happens. Will try some stuff and report back if something solves it.

// Edit: It’s working now in DT! In case it helps someone, these are the steps that seem to have worked:

Apparently homebrew does not always automatically install applications to the usr/local/bin path - however, this is the path referenced in the script. After not finding Exiftool there although it was already installed, I found it in my homebrew install directory and changed the path in the script accordingly. Then, after running the script through Script Editor once and then trying again in DT, I got the below message and confirmed. It’s now fully functional - looking forward to further testing.

chrillek · May 4, 2022, 1:27pm

Good to see that it works now. Bad to see that Apple(Script) doesn’t throw an error when it can’t find the external program.

And another argument for going with self-contained scripts as much as possible. The same holds, btw, for JSONHelper. If one used JavaScript instead of AppleScript, these tools would not be necessary at all. Less stuff to install, less cruft, fewer problems.

It would actually be possible to set the PDF metadata using Objective-C from the script, avoiding exiftool completely. Probably more elegantly, too, than calling exiftool for Title and Author separately.

mnot · May 4, 2022, 10:53pm

I’ve updated the script, including the installation instructions; see how that goes for you (note especially the osacompile step).

wrothnie · July 5, 2022, 2:49am

Does this script still exist? It does not appear to be one installed in Devonthink 3 Pro (at least on my computer) and I couldn’t find it in More Scripts…

BLUEFROG · July 5, 2022, 6:21am

It’s a smart rule script for use in smart rules with an Execute Script action.

AWD · August 1, 2022, 8:17am

Hi @BLUEFROG,

I modified the script of @mnot to also extract the DOI from crossref.org. I can put the DOI into one of the metadata fields via exiftool in the same way author and title are written. At the moment I am writing the DOI data to the subject field of the file, for lack of a better field.

But I would like to go one step further and pass the DOI into the custom metadata field DOI provided by Devonthink. Because when I do that I can run the smart rule script mentioned before to fill even more metadata automatically.

But I couldn’t find a way to access that field in the apple script, can you help me out here please.

Below the modified version of the script. I added all the DOI parts, rest is the same as before.


-- Look up document metadata on CrossRef.org in DevonThink 3
-- 
-- Currently sets:
-- * Created date to the document's publication date
-- * Title in document properties
-- * Author in document properties (first author only)
-- * A tag for the type of document
-- 
--
-- To install:
-- 0. Quit DevonThink
-- 1. Place this script in the DevonThink Scripts folder (`Scripts -> Open Scripts Folder`)
-- 2. In Terminal, navigate to that folder and run `osacompile -o Crossref\ Lookup.scpt crossref-lookup.applescript; rm crossref-lookup.applescript`
-- 3. Install JSON Helper: https://apps.apple.com/au/app/json-helper-for-applescript/id453114608?mt=12
-- 4. Install exiftool: `brew install exiftool` (using homebrew; see https://brew.sh)
-- 5. Give the script a keyboard shortcut (`System Preferences -> Keyboard -> Shortcuts -> App Shortcuts`) (optional)
--
-- To use:
-- 0. Open a PDF in DevonThink
-- 1. Using your cursor, select a paper's title, ISSN, or similar identifying information
-- 2. Trigger the script (e.g., using the keyboard shortcut)
-- 3. Select the best fitting candidate from the list
-- 4. Click `OK`


-- Main script: build a query from the open DEVONthink document, ask crossref.org
-- for matching works, let the user pick one, then update the record's date, tags
-- and PDF metadata (Title, Author, and the DOI stored in Subject).

-- Step 1: make sure a window and a document are available, then derive the query.
tell application id "DNtp"
	try
		if not (exists think window 1) then error "No window open."
		if not (exists content record) then error "No document selected."
	on error error_message number error_number
		-- error -128 (AppleScript's "user canceled") is not reported
		if the error_number is not -128 then display alert "DEVONthink" message error_message as warning
		return
	end try
	
	set theRecord to content record
	-- use the selected text as the query; fall back to the record's name when it is empty
	set query to the selected text of think window 1 as string
	if query is "" then
		set query to name of theRecord
	end if
	
	show progress indicator "Looking up references..." steps -1
end tell

-- Step 2: make an API request to crossref.org (at most 5 rows, selected fields only).
-- `quoted form of` shell-escapes the query before it is handed to curl.
set apiURL to "https://api.crossref.org/works"
set shellScript to ("curl -A \"(https://gist.github.com/mnot/0d7825bde9b9d3233f623c71765f20ca)\" -G " & apiURL ¬
	& " --data-urlencode query.bibliographic=" & quoted form of query ¬
	& " -d rows=5 -d select=author,title,created,type,publisher,published,subject,DOI")
set apiResult to (do shell script shellScript)

tell application id "DNtp"
	hide progress indicator
end tell

-- Step 3: parse the JSON response with the JSON Helper application
tell application "JSON Helper"
	set json to read JSON from apiResult
end tell

-- Abort when the API did not report success.
-- NOTE(review): `error` raises, so the `return` after it is never reached.
if not status of json is equal to "OK" then
	error "API response not OK: " & message of json
	return
end if

-- Step 4: build one "title (author, date)" entry per result for the selection dialogue.
-- Any error while reading a result (other than -128) is shown as an alert and ends the script.
try
	set theItems to |items| of message of json
	set theChoices to {}
	repeat with a from 1 to length of theItems
		set currentItem to item a of theItems
		set titleList to title of currentItem
		set title to item 1 of titleList
		set dateString to my extractDate(currentItem)
		set detailString to my extractAuthor(currentItem)
		-- only put a separator in front of the date when there is an author
		if not detailString is equal to "" then
			set detailString to detailString & ", "
		end if
		set detailString to detailString & dateString
		set theChoice to title & " (" & detailString & ")"
		set end of theChoices to theChoice
	end repeat
on error error_message number error_number
	if the error_number is not -128 then display alert "DEVONthink" message error_message as warning
	return
end try

-- Step 5: let the user choose; a result of false ends the script without changes
set theChoice to choose from list theChoices with prompt "Select:"
if theChoice is false then return

-- Map the chosen label back to its API item by comparing the label text.
-- The loop does not stop at the first hit, so with identical labels the last match is used.
repeat with a from 1 to length of theChoices
	set currentItem to item a of theChoices
	if currentItem as string is equal to theChoice as string then
		set chosenItem to item a of theItems
	end if
end repeat

-- Step 6: update document
-- extractDate returns "day-month-year".
-- NOTE(review): `date (dateString)` presumably parses according to the user's
-- date format settings; confirm that day and month are not swapped on other locales.
set dateString to my extractDate(chosenItem)
set appleDate to date (dateString)

tell application id "DNtp"
	try
		-- re-check: the window or document may have changed while the dialogue was open
		if not (exists think window 1) then error "No window open."
		if not (exists content record) then error "No document selected."
		set theRecord to content record
		set recordPath to path of theRecord
		-- date
		set the date of theRecord to appleDate
		-- tags: add the item's type unless the record already carries it
		set typeTag to |type| of chosenItem
		if typeTag is not in tags of theRecord then
			set tagList to tags of theRecord
			set end of tagList to typeTag
			set tags of theRecord to tagList
		end if
		-- title
		set recordTitle to item 1 of title of chosenItem
		my setPDFMetadata("Title", recordTitle, recordPath)
		-- author
		set authorString to my extractAuthor(chosenItem)
		my setPDFMetadata("Author", authorString, recordPath)
		-- DOI: written to the PDF's Subject field
		set doiString to my extractDOI(chosenItem)
		my setPDFMetadata("Subject", doiString, recordPath)
		
	on error error_message number error_number
		if the error_number is not -128 then display alert "DEVONthink" message error_message as warning
	end try
end tell

-- Return the date of a CrossRef item as a "day-month-year" string.
-- The first entry of the item's `published` date-parts is used; if reading it
-- as year, month and day fails, the `created` date-parts are used instead.
on extractDate(record_item)
	try
		set publishedParts to item 1 of |date-parts| of published of record_item
		set {iyear, imonth, iday} to publishedParts
	on error
		-- missing or incomplete publication date
		set createdParts to item 1 of |date-parts| of created of record_item
		set {iyear, imonth, iday} to createdParts
	end try
	return (iday as string) & "-" & (imonth as string) & "-" & (iyear as string)
end extractDate

-- Return the DOI of a CrossRef item, or the text "missing" when the item's
-- DOI property cannot be read.
on extractDOI(record_item)
	try
		return DOI of record_item
	on error
		return "missing"
	end try
end extractDOI

-- Return a display name for the author whose `sequence` is "first".
-- Result is "given family" when both can be read, otherwise the family name,
-- otherwise the entry's `name`. Returns "" when the item has no author list
-- or no entry is marked "first".
on extractAuthor(record_item)
	try
		set authorList to author of record_item
	on error
		-- the item carries no author property
		return ""
	end try
	
	-- "" doubles as the "nothing found" marker; the last entry marked "first" wins
	set leadAuthor to ""
	set authorCount to length of authorList
	repeat with idx from 1 to authorCount
		set candidate to item idx of authorList
		if sequence of candidate is equal to "first" then set leadAuthor to candidate
	end repeat
	if leadAuthor is equal to "" then return ""
	
	try
		return |given| of leadAuthor & " " & family of leadAuthor
	on error
		try
			return family of leadAuthor
		on error
			return |name| of leadAuthor
		end try
	end try
end extractAuthor

-- Write one metadata tag (mdKey = mdValue) into the file at recordPath by
-- running exiftool with -overwrite_original. The value and the path are
-- shell-quoted with `quoted form of`; the key is passed as given.
on setPDFMetadata(mdKey, mdValue, recordPath)
	set exiftoolPath to "/opt/homebrew/bin/exiftool"
	set tagArgument to "-" & mdKey & "=" & quoted form of mdValue
	set fileArgument to quoted form of recordPath
	do shell script exiftoolPath & " -overwrite_original " & tagArgument & " " & fileArgument
end setPDFMetadata

Thanks & BR
AWD

AWD · August 1, 2022, 1:19pm

Problem solved


-- Look up document metadata on CrossRef.org in DevonThink 3
-- 
-- Currently sets:
-- * Created date to the document's publication date
-- * Title in document properties
-- * Author in document properties (first author only)
-- * A tag for the type of document
-- 
--
-- To install:
-- 0. Quit DevonThink
-- 1. Place this script in the DevonThink Scripts folder (`Scripts -> Open Scripts Folder`)
-- 2. In Terminal, navigate to that folder and run `osacompile -o Crossref\ Lookup.scpt crossref-lookup.applescript; rm crossref-lookup.applescript`
-- 3. Install JSON Helper: https://apps.apple.com/au/app/json-helper-for-applescript/id453114608?mt=12
-- 4. Install exiftool: `brew install exiftool` (using homebrew; see https://brew.sh)
-- 5. Give the script a keyboard shortcut (`System Preferences -> Keyboard -> Shortcuts -> App Shortcuts`) (optional)
--
-- To use:
-- 0. Open a PDF in DevonThink
-- 1. Using your cursor, select a paper's title, ISSN, or similar identifying information
-- 2. Trigger the script (e.g., using the keyboard shortcut)
-- 3. Select the best fitting candidate from the list
-- 4. Click `OK`

-- Main script: build a query from the open DEVONthink document, ask crossref.org
-- for matching works, let the user pick one, then update the record's date, tags,
-- PDF metadata (Title, Author, DOI in Subject) and the "doi" custom metadata.

-- Step 1: make sure a window and a document are available, then derive the query.
tell application id "DNtp"
	try
		if not (exists think window 1) then error "No window open."
		if not (exists content record) then error "No document selected."
	on error error_message number error_number
		-- error -128 (AppleScript's "user canceled") is not reported
		if the error_number is not -128 then display alert "DEVONthink" message error_message as warning
		return
	end try
	
	set theRecord to content record
	-- use the selected text as the query; fall back to the record's name when it is empty
	set query to the selected text of think window 1 as string
	if query is "" then
		set query to name of theRecord
	end if
	
	show progress indicator "Looking up references..." steps -1
end tell

-- Step 2: make an API request to crossref.org (at most 5 rows, selected fields only).
-- `quoted form of` shell-escapes the query before it is handed to curl.
set apiURL to "https://api.crossref.org/works"
set shellScript to ("curl -A \"(https://gist.github.com/mnot/0d7825bde9b9d3233f623c71765f20ca)\" -G " & apiURL ¬
	& " --data-urlencode query.bibliographic=" & quoted form of query ¬
	& " -d rows=5 -d select=author,title,created,type,publisher,published,subject,DOI")
set apiResult to (do shell script shellScript)

tell application id "DNtp"
	hide progress indicator
end tell

-- Step 3: parse the JSON response with the JSON Helper application
tell application "JSON Helper"
	set json to read JSON from apiResult
end tell

-- Abort when the API did not report success.
-- NOTE(review): `error` raises, so the `return` after it is never reached.
if not status of json is equal to "OK" then
	error "API response not OK: " & message of json
	return
end if

-- Step 4: build one "title (author, date)" entry per result for the selection dialogue.
-- Any error while reading a result (other than -128) is shown as an alert and ends the script.
try
	set theItems to |items| of message of json
	set theChoices to {}
	repeat with a from 1 to length of theItems
		set currentItem to item a of theItems
		set titleList to title of currentItem
		set title to item 1 of titleList
		set dateString to my extractDate(currentItem)
		set detailString to my extractAuthor(currentItem)
		-- only put a separator in front of the date when there is an author
		if not detailString is equal to "" then
			set detailString to detailString & ", "
		end if
		set detailString to detailString & dateString
		set theChoice to title & " (" & detailString & ")"
		set end of theChoices to theChoice
	end repeat
on error error_message number error_number
	if the error_number is not -128 then display alert "DEVONthink" message error_message as warning
	return
end try

-- Step 5: let the user choose; a result of false ends the script without changes
set theChoice to choose from list theChoices with prompt "Select:"
if theChoice is false then return

-- Map the chosen label back to its API item by comparing the label text.
-- The loop does not stop at the first hit, so with identical labels the last match is used.
repeat with a from 1 to length of theChoices
	set currentItem to item a of theChoices
	if currentItem as string is equal to theChoice as string then
		set chosenItem to item a of theItems
	end if
end repeat

-- Step 6: update document
-- extractDate returns "day-month-year".
-- NOTE(review): `date (dateString)` presumably parses according to the user's
-- date format settings; confirm that day and month are not swapped on other locales.
set dateString to my extractDate(chosenItem)
set appleDate to date (dateString)

tell application id "DNtp"
	try
		-- re-check: the window or document may have changed while the dialogue was open
		if not (exists think window 1) then error "No window open."
		if not (exists content record) then error "No document selected."
		set theRecord to content record
		set recordPath to path of theRecord
		-- date
		set the date of theRecord to appleDate
		-- tags: first a fixed marker tag for records updated by this script ...
		set typeTag to "import_Metadata"
		if typeTag is not in tags of theRecord then
			set tagList to tags of theRecord
			set end of tagList to typeTag
			set tags of theRecord to tagList
		end if
		-- ... then the item's type, again only if the record does not carry it yet
		set typeTag to |type| of chosenItem
		if typeTag is not in tags of theRecord then
			set tagList to tags of theRecord
			set end of tagList to typeTag
			set tags of theRecord to tagList
		end if
		-- title
		set recordTitle to item 1 of title of chosenItem
		my setPDFMetadata("Title", recordTitle, recordPath)
		-- author
		set authorString to my extractAuthor(chosenItem)
		my setPDFMetadata("Author", authorString, recordPath)
		-- DOI: written to the PDF's Subject field and to the record's "doi" custom metadata
		set doiString to my extractDOI(chosenItem)
		my setPDFMetadata("Subject", doiString, recordPath)
		add custom meta data doiString for "doi" to theRecord
		
	on error error_message number error_number
		if the error_number is not -128 then display alert "DEVONthink" message error_message as warning
	end try
end tell

-- Return the date of a CrossRef item as a "day-month-year" string.
-- The first entry of the item's `published` date-parts is used; if reading it
-- as year, month and day fails, the `created` date-parts are used instead.
on extractDate(record_item)
	try
		set publishedParts to item 1 of |date-parts| of published of record_item
		set {iyear, imonth, iday} to publishedParts
	on error
		-- missing or incomplete publication date
		set createdParts to item 1 of |date-parts| of created of record_item
		set {iyear, imonth, iday} to createdParts
	end try
	return (iday as string) & "-" & (imonth as string) & "-" & (iyear as string)
end extractDate

-- Return the DOI of a CrossRef item, or the text "missing" when the item's
-- doi property cannot be read.
on extractDOI(record_item)
	try
		return doi of record_item
	on error
		return "missing"
	end try
end extractDOI

-- Return a display name for the author whose `sequence` is "first".
-- Result is "given family" when both can be read, otherwise the family name,
-- otherwise the entry's `name`. Returns "" when the item has no author list
-- or no entry is marked "first".
on extractAuthor(record_item)
	try
		set authorList to author of record_item
	on error
		-- the item carries no author property
		return ""
	end try
	
	-- "" doubles as the "nothing found" marker; the last entry marked "first" wins
	set leadAuthor to ""
	set authorCount to length of authorList
	repeat with idx from 1 to authorCount
		set candidate to item idx of authorList
		if sequence of candidate is equal to "first" then set leadAuthor to candidate
	end repeat
	if leadAuthor is equal to "" then return ""
	
	try
		return |given| of leadAuthor & " " & family of leadAuthor
	on error
		try
			return family of leadAuthor
		on error
			return |name| of leadAuthor
		end try
	end try
end extractAuthor

-- Write one metadata tag (mdKey = mdValue) into the file at recordPath by
-- running exiftool with -overwrite_original. The value and the path are
-- shell-quoted with `quoted form of`; the key is passed as given.
on setPDFMetadata(mdKey, mdValue, recordPath)
	set exiftoolPath to "/opt/homebrew/bin/exiftool"
	set tagArgument to "-" & mdKey & "=" & quoted form of mdValue
	set fileArgument to quoted form of recordPath
	do shell script exiftoolPath & " -overwrite_original " & tagArgument & " " & fileArgument
end setPDFMetadata

BLUEFROG · August 1, 2022, 1:23pm

Glad you got it worked out.

mnot · October 10, 2022, 6:20am

Just an update –

I’ve added the DOI URL in the ‘URL’ field, and stored a formatted citation in the finder comments; see the script for details.

I’ve also changed the User-Agent header to be friendlier to crossref.org (as they request); as a result, updating should make lookups faster and more reliable.