So I’ve gone with the following script, which is now working exactly as I want it to (though it may be a bit convoluted):
-- Script-wide settings.
-- pAPIKey: inserted into the generated Python program, which sends it as the
-- "Authorization: Bearer ..." header to api.openai.com. Replace "..." with a real key.
property pAPIKey : "..."
-- pModel: inserted into the generated Python program as the "model" field of
-- every chat-completion request.
property pModel : "gpt-4-turbo"
-- pTargetDatabase: name of the open DEVONthink database whose incoming group
-- receives the generated Markdown record.
property pTargetDatabase : "Feeds"
-- DEVONthink smart-rule entry point.
--
-- For every record in theRecords:
--   1. extracts the text of the record's HTML source,
--   2. asks the OpenAI chat API which language it is and, unless it is English/Unknown,
--      translates the body and the title to English,
--   3. creates a Markdown record named "YYMMDD_HHMM <title>" next to the original,
--      copies URL, tags (plus the parent group's name as an extra tag) and creation date,
--   4. moves the new record to the incoming group of the database named pTargetDatabase,
--   5. deletes the original record.
--
-- Parameters:
--   theRecords - list of DEVONthink records handed over by the smart rule.
--
-- Errors are logged per record with "log message"; one failing record does not stop the rest.
-- The temporary files (one of which contains the API key) are always removed at the end.
on performSmartRule(theRecords)
	set tempFolder to POSIX path of (path to temporary items)
	set pythonFile to tempFolder & "translate_feeds.py"
	set htmlFile to tempFolder & "content.html"
	
	-- The Python program is identical for every record, so it is written once.
	try
		writeUTF8File(pythonFile, buildPythonScript())
	on error prepError
		removeTempFiles({pythonFile})
		tell application id "DNtp" to log message "Could not write translation script: " & prepError
		return
	end try
	
	tell application id "DNtp"
		repeat with theRecord in theRecords
			-- Placeholder so the error log below never refers to an undefined/stale title.
			set theTitle to "(unknown)"
			try
				-- Get content
				set theHTML to source of theRecord
				set theTitle to name of theRecord
				
				-- Date prefix is derived from the creation date of the original record.
				set originalDate to creation date of theRecord
				set dateString to my formatDateStamp(originalDate)
				
				-- Get the feed title from the parent group
				set feedTitle to ""
				try
					set parentGroup to parent 1 of theRecord
					if parentGroup is not missing value and parentGroup is not trash group then
						set feedTitle to name of parentGroup
					end if
				on error
					set feedTitle to ""
				end try
				
				-- Log for debugging
				log message "Processing: " & theTitle
				log message " Date: " & dateString
				if feedTitle is not "" then
					log message " From feed: " & feedTitle
				end if
				
				-- Hand the HTML to Python through a file; a write failure raises instead of
				-- letting Python run against a stale file from an earlier record.
				my writeUTF8File(htmlFile, theHTML)
				
				-- On a non-zero exit status "do shell script" raises an error carrying the
				-- program's stderr text, which lands in the "on error" handler below.
				-- Three API calls can take a while, hence the generous timeout.
				with timeout of 600 seconds
					set scriptResult to do shell script "python3 " & quoted form of pythonFile & " " & quoted form of htmlFile & " " & quoted form of theTitle
				end timeout
				
				-- Line-based parsing: article text can no longer be mistaken for a marker.
				set parsed to my parseScriptOutput(scriptResult)
				set translatedTitle to outTitle of parsed
				set translatedContent to outContent of parsed
				set detectedLang to outLang of parsed
				set wasTranslated to outTranslated of parsed
				
				-- Additional AppleScript sanitization for title
				repeat with badChar in {":", "/", "\\", "*", "?", "\"", "<", ">", "|"}
					set translatedTitle to my replaceText(contents of badChar, "-", translatedTitle)
				end repeat
				-- Final trim (xargs collapses and trims whitespace)
				set translatedTitle to do shell script "echo " & quoted form of translatedTitle & " | xargs"
				-- Ensure no double periods before extension
				set translatedTitle to my replaceText("..", ".", translatedTitle)
				
				-- Prepend the date to the filename
				set finalFilename to dateString & " " & translatedTitle
				
				-- Create a new markdown record with date-prefixed filename
				set mdRecord to create record with {name:finalFilename, type:markdown, content:translatedContent} in parent 1 of theRecord
				
				-- Copy metadata
				set URL of mdRecord to URL of theRecord
				
				-- Build tags list: existing tags plus feed title
				set existingTags to tags of theRecord
				if feedTitle is not "" and feedTitle is not missing value then
					if existingTags is missing value or existingTags is {} then
						set tags of mdRecord to {feedTitle}
					else
						set tags of mdRecord to existingTags & {feedTitle}
					end if
				else
					if existingTags is not missing value then
						set tags of mdRecord to existingTags
					end if
				end if
				set creation date of mdRecord to creation date of theRecord
				set modification date of mdRecord to current date
				
				-- Move to target database (best effort: a failure is logged, not fatal)
				set moveSuccess to false
				try
					set targetDB to missing value
					repeat with aDatabase in databases
						if name of aDatabase is pTargetDatabase then
							set targetDB to aDatabase
							exit repeat
						end if
					end repeat
					if targetDB is not missing value then
						move record mdRecord to incoming group of targetDB
						set moveSuccess to true
						log message "Successfully moved to database: " & pTargetDatabase
					else
						log message "Target database '" & pTargetDatabase & "' not found."
					end if
				on error moveError
					log message "Move error: " & moveError
				end try
				
				-- Delete the original HTML record
				try
					delete record theRecord
					log message "Deleted original HTML file"
				on error deleteError
					log message "Could not delete original: " & deleteError
				end try
				
				if wasTranslated then
					log message "Successfully translated: " & theTitle & " → " & finalFilename & " (Language: " & detectedLang & ")"
				else
					log message "Successfully converted: " & theTitle & " → " & finalFilename & " (Language: " & detectedLang & ")"
				end if
			on error errMsg
				-- Make sure a failure cannot leave altered delimiters behind for the next record.
				set AppleScript's text item delimiters to ""
				log message "Error processing '" & theTitle & "': " & errMsg
			end try
		end repeat
	end tell
	
	-- Always clean up, whether or not individual records failed.
	removeTempFiles({pythonFile, htmlFile})
end performSmartRule

-- Returns theDate formatted as "YYMMDD_HHMM".
on formatDateStamp(theDate)
	set yy to zeroPad((year of theDate) mod 100)
	set mm to zeroPad((month of theDate) as integer)
	set dd to zeroPad(day of theDate)
	set hh to zeroPad(hours of theDate)
	set mi to zeroPad(minutes of theDate)
	return yy & mm & dd & "_" & hh & mi
end formatDateStamp

-- Returns the integer n (0-99) as a two-character, zero-padded string.
on zeroPad(n)
	return text -2 thru -1 of ("0" & (n as text))
end zeroPad

-- Returns sourceText with every occurrence of findText replaced by replaceWith;
-- text item delimiters are restored before returning.
on replaceText(findText, replaceWith, sourceText)
	set previousDelimiters to AppleScript's text item delimiters
	set AppleScript's text item delimiters to findText
	set pieces to text items of sourceText
	set AppleScript's text item delimiters to replaceWith
	set rebuilt to pieces as text
	set AppleScript's text item delimiters to previousDelimiters
	return rebuilt
end replaceText

-- Writes theText as UTF-8 to posixPath, replacing any previous content.
-- Re-raises any error after closing the file.
on writeUTF8File(posixPath, theText)
	set fileRef to open for access (POSIX file posixPath) with write permission
	try
		set eof fileRef to 0
		write theText to fileRef as «class utf8»
		close access fileRef
	on error errMsg number errNum
		try
			close access fileRef
		end try
		error errMsg number errNum
	end try
end writeUTF8File

-- Deletes every POSIX path in pathList, ignoring failures (best-effort cleanup).
on removeTempFiles(pathList)
	repeat with aPath in pathList
		try
			do shell script "rm -f " & quoted form of (contents of aPath)
		end try
	end repeat
end removeTempFiles

-- Returns the part of theLine after thePrefix ("" when nothing follows);
-- raises when theLine does not start with thePrefix.
on valueAfterPrefix(theLine, thePrefix)
	if theLine does not start with thePrefix then error "Malformed output line: " & theLine
	if (length of theLine) is (length of thePrefix) then return ""
	return text ((length of thePrefix) + 1) thru -1 of theLine
end valueAfterPrefix

-- Splits the Python program's stdout into its fields. Expected layout:
--   line 1  TITLE:<sanitized title>
--   line 2  LANG:<detected language>
--   line 3  TRANSLATED:YES|NO
--   line 4  CONTENT:
--   line 5+ the Markdown body (re-joined with linefeeds)
-- Returns {outTitle, outLang, outTranslated, outContent}.
on parseScriptOutput(rawOutput)
	set outputLines to paragraphs of rawOutput
	if (count of outputLines) < 4 then error "Unexpected output from translation script: " & rawOutput
	set parsedTitle to valueAfterPrefix(item 1 of outputLines, "TITLE:")
	set parsedLang to valueAfterPrefix(item 2 of outputLines, "LANG:")
	if parsedLang is "" then set parsedLang to "unknown"
	set parsedFlag to valueAfterPrefix(item 3 of outputLines, "TRANSLATED:")
	if (item 4 of outputLines) is not "CONTENT:" then error "Unexpected output from translation script: " & rawOutput
	set bodyText to ""
	if (count of outputLines) > 4 then
		set previousDelimiters to AppleScript's text item delimiters
		set AppleScript's text item delimiters to linefeed
		set bodyText to (items 5 thru -1 of outputLines) as text
		set AppleScript's text item delimiters to previousDelimiters
	end if
	return {outTitle:parsedTitle, outLang:parsedLang, outTranslated:(parsedFlag is "YES"), outContent:bodyText}
end parseScriptOutput

-- Returns the source of the Python 3 program that extracts text from an HTML file,
-- detects its language, translates it if needed, and prints the result.
-- NOTE: pAPIKey and pModel are pasted into single-quoted Python literals, so they must
-- not contain a single quote or a backslash.
on buildPythonScript()
	return "#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Usage: translate_feeds.py <html-file> [title]
# stdout: TITLE:/LANG:/TRANSLATED: lines, a CONTENT: marker line, then the Markdown body.
# On failure: message on stderr and exit status 1.
import sys
import json
import urllib.error
import urllib.request
import html
import re
from html.parser import HTMLParser

API_URL = 'https://api.openai.com/v1/chat/completions'
api_key = '" & pAPIKey & "'
model = '" & pModel & "'
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {api_key}'
}

BLOCK_TAGS = ('p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')


class HTMLTextExtractor(HTMLParser):
    '''Collect visible text, one entry per block-level element.'''

    def __init__(self):
        super().__init__()
        self.text_parts = []
        self.current_text = []
        self.in_script = False
        self.in_style = False

    def _flush_paragraph(self):
        if self.current_text:
            self.text_parts.append(' '.join(self.current_text).strip())
            self.current_text = []

    def handle_starttag(self, tag, attrs):
        if tag == 'script':
            self.in_script = True
        elif tag == 'style':
            self.in_style = True
        elif tag == 'br' or tag in BLOCK_TAGS:
            self._flush_paragraph()

    def handle_endtag(self, tag):
        if tag == 'script':
            self.in_script = False
        elif tag == 'style':
            self.in_style = False
        elif tag in BLOCK_TAGS:
            self._flush_paragraph()

    def handle_data(self, data):
        if not self.in_script and not self.in_style:
            cleaned = data.strip()
            if cleaned:
                self.current_text.append(cleaned)

    def get_text(self):
        self._flush_paragraph()
        return '\\n\\n'.join([p for p in self.text_parts if p])


def sanitize_filename(filename):
    '''Clean filename to avoid filesystem issues'''
    # Remove HTML entities first
    filename = html.unescape(filename)
    # Replace problematic characters with safe alternatives
    replacements = {
        '/': '-',
        '\\\\': '-',
        ':': '-',
        '*': '',
        '?': '',
        '\"': '',
        '<': '',
        '>': '',
        '|': '-',
        '&': 'and',
        '#': '',
        '%': '',
        '{': '(',
        '}': ')',
        '$': '',
        '!': '',
        '@': 'at',
        '+': 'plus',
        '`': '',
        '=': '-',
        '[': '(',
        ']': ')',
        ';': '-',
        '\\'': '',
        ',': '',
        '~': '-'
    }
    for old, new in replacements.items():
        filename = filename.replace(old, new)
    # Replace multiple spaces with single space (this also removes line breaks)
    filename = ' '.join(filename.split())
    # Replace periods with spaces (except for the last one if it exists)
    parts = filename.split('.')
    if len(parts) > 1:
        filename = ' '.join(parts[:-1]) + '.' + parts[-1]
    else:
        filename = parts[0]
    # Remove any remaining periods that aren't followed by an extension
    filename = re.sub(r'\\.(?![a-zA-Z]{2,4}$)', ' ', filename)
    # Replace multiple spaces/dashes with single ones
    filename = re.sub(r'\\s+', ' ', filename)
    filename = re.sub(r'-+', '-', filename)
    # Remove leading/trailing spaces and dashes
    filename = filename.strip(' -')
    # Ensure filename isn't empty
    if not filename or filename == '.':
        filename = 'Untitled'
    # Limit length (leave room for date prefix and extension)
    if len(filename) > 180:
        filename = filename[:180].strip()
    return filename


def chat(system_prompt, user_prompt, temperature, max_tokens, timeout=60):
    '''Send one chat-completion request; return the reply text, or None if there is none.'''
    payload = {
        'model': model,
        'messages': [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_prompt}
        ],
        'temperature': temperature,
        'max_tokens': max_tokens
    }
    req = urllib.request.Request(
        API_URL,
        data=json.dumps(payload).encode('utf-8'),
        headers=headers
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as response:
            result = json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        # str(HTTPError) omits the response body, which is where the API explains
        # the failure (e.g. the context-length message checked for below).
        detail = e.read().decode('utf-8', 'replace')
        raise RuntimeError(f'HTTP {e.code}: {detail}') from e
    if result.get('choices'):
        return result['choices'][0]['message']['content']
    return None


def main():
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    # Get the HTML content from file
    with open(sys.argv[1], 'r', encoding='utf-8') as f:
        html_content = f.read()

    parser = HTMLTextExtractor()
    parser.feed(html_content)
    text_content = parser.get_text()

    # Get title from argument
    title = sys.argv[2] if len(sys.argv) > 2 else 'Document'

    # First, detect language - be specific about the languages we're looking for
    reply = chat(
        'You are a language detection expert. Reply with ONLY one of these exact words: English, Somali, Tigrinya, Amharic, or Unknown',
        f'What language is this text? Choose from: English, Somali, Tigrinya, Amharic. Text: \"{text_content[:500]}\"',
        0.1,
        10
    )
    # Collapse whitespace so the value always fits on one output line
    detected_language = ' '.join((reply or '').split()) or 'Unknown'

    # Check if translation is needed (anything except English)
    needs_translation = detected_language.lower() not in ['english', 'unknown']

    if needs_translation:
        # Translate content
        translated_content = chat(
            f'You are a professional {detected_language} to English translator. Translate the text while preserving paragraph breaks.',
            f'Translate this {detected_language} text to English, preserving all paragraph breaks: {text_content}',
            0.3,
            4000,
            timeout=120
        )
        if translated_content is None:
            translated_content = 'Translation failed'

        # Translate title
        reply = chat(
            f'You are a {detected_language} to English translator. Provide ONLY the English translation with no explanations, quotes, or additional text.',
            f'Translate to English: {title}',
            0.3,
            100
        )
        if reply is None:
            translated_title = title
        else:
            # Clean up the title
            translated_title = reply.strip().strip('\"').strip('\\'').strip()

        # Sanitize the filename
        translated_title = sanitize_filename(translated_title)

        # Bilingual markdown output. Parts are separated by blank lines so that the
        # '---' rule is not read as a setext heading underline for the text above it.
        output_markdown = '\\n\\n'.join([
            f'# {translated_title}',
            translated_content,
            '---',
            '## ORIGINAL TEXT',
            f'# {title}',
            text_content
        ])
    else:
        # Content is already in English or Unknown, just convert to markdown
        translated_title = sanitize_filename(title)
        output_markdown = f'# {title}\\n\\n{text_content}'

    # Metadata first, one value per line, body last: the body can contain anything
    # without being confused with a marker.
    print('TITLE:' + translated_title)
    print('LANG:' + detected_language)
    print('TRANSLATED:' + ('YES' if needs_translation else 'NO'))
    print('CONTENT:')
    print(output_markdown)


if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        message = str(e)
        if 'maximum context length' in message:
            message = 'Content too long. Consider processing in chunks.'
        print(message, file=sys.stderr)
        sys.exit(1)
"
end buildPythonScript