Hello,
Do some of you save Twitter messages on DT? If so, how do you do it?
I’m trying to do it myself, and I’m getting there, but I’m not entirely convinced that I’m using the best method, and above all, I can’t get the image from the tweet.
I’m exporting as rich text.
The only way I’ve found (and I’ve tried many ways) is to do a screen shot with the macOS screen capture tool (cmd-shift-5), save into the Global Inbox, then in DEVONthink have a Smart Rule that converts that jpg into a PDF. I copy the URL of the Twitter/X post into the URL metadata field.
Thank you, that’s very interesting. I hadn’t even thought of that!
The other method that sometimes works for me is to go to the “tweet” itself (rather than being on a list) and “print” to PDF (cmd-P). Sometimes the filename picked by the computer is mangled, but then I fix it manually.
Thank you, but my usage primarily concerns my mobile phone, so DEVONthink To Go.
In my experience, I can also print on my mobile phone: Select “Share” in the browser, select “Print” in the share sheet, select “Share” in the print dialog and then save to DTTG. Three clicks, admittedly, but perhaps that can be shortened with a Shortcut. I never use those because the stuff seems too unreliable to me, though. Other people like it.
I’m trying to do this from the Twitter app and not from a browser, which may be the problem.
If the Twitter/X app has a share sheet, it should work the same. But I have no idea of that stuff and do not intend to ever use it.
I have checked carefully, and unfortunately it is not possible.
iOS screenshots should work regardless of the app.
Maybe my optimism about humans is misplaced with that “should”. I haven’t futzed with the xwitter app in ages and go to the gram only when work requires me to follow a link.
In the past I preferred archiving Twitter and Instagram from iOS as the resulting PDF looks more like what I think those platforms were designed for (tall vertical page in one column).
Saving from Twitter / X is really annoying. I had been doing it manually by copying-and-pasting things I find interesting from a browser into DEVONthink, but last week I started using AI tools and now have a custom script to run on my computer. It needs the Twitter cookies to be able to log in to the virtual browser, but once you have completed that step you just pass it a URL and it downloads the text and images, saves them in RTFD (Rich Text Format with attachments), and sends the result to the DEVONthink Inbox.
Would you be willing to share the script with us? Having the possibility to automatically download Instagram and X feeds that have been shared with DTTG would be great (currently I save only the bookmark).
#!/Users/user/.x2dt/venv/bin/python3
"""x2dt — Save X/Twitter articles and tweets to DEVONthink as RTFD.
Usage:
x2dt <tweet-or-article-url>
x2dt --setup # open browser to log into X (saves session)
"""
import argparse
import base64
import os
import re
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from playwright.sync_api import sync_playwright
# Persistent Chromium profile directory — keeps the X login session (cookies)
# between runs; populated by `x2dt --setup`.
PROFILE_DIR = Path.home() / ".x2dt" / "browser_profile"
# Page-load timeout for Playwright navigation, in milliseconds.
TIMEOUT_MS = 30_000
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def sanitize_filename(name: str, max_len: int = 80) -> str:
    """Replace filesystem-unsafe characters, collapse whitespace, cap the length."""
    safe = re.sub(r'[/\\:*?"<>|]', "_", name)
    safe = re.sub(r'\s+', " ", safe).strip()[:max_len]
    return safe if safe else "x2dt_import"
def is_article_url(url: str) -> bool:
    """True if *url* points at an X long-form Article (/i/article(s) path)."""
    path = urlparse(url).path
    return any(marker in path for marker in ("/i/article", "/i/articles"))
def is_tweet_url(url: str) -> bool:
    """True for canonical tweet permalinks, e.g. x.com/username/status/12345."""
    path = urlparse(url).path
    return re.search(r'/status/\d+', path) is not None
# ---------------------------------------------------------------------------
# Playwright: load page
# ---------------------------------------------------------------------------
# Chromium flags that make the automated browser look like a regular one.
STEALTH_ARGS = [
    "--disable-blink-features=AutomationControlled",
    "--no-first-run",
    "--no-default-browser-check",
]
# Removes navigator.webdriver so X can't fingerprint Playwright
STEALTH_INIT_SCRIPT = """
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
"""
def load_page(url: str, headless: bool = True) -> str:
    """Return full rendered HTML of the page.

    Uses the persistent browser profile (PROFILE_DIR) so the X session saved
    by --setup is reused. Raises RuntimeError if the page cannot be loaded.
    """
    with sync_playwright() as pw:
        ctx = pw.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=headless,
            viewport={"width": 1280, "height": 900},
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36"
            ),
            locale="en-US",
            ignore_default_args=["--enable-automation"],
            args=STEALTH_ARGS,
        )
        ctx.add_init_script(STEALTH_INIT_SCRIPT)
        page = ctx.new_page()
        try:
            page.goto(url, wait_until="networkidle", timeout=TIMEOUT_MS)
        except Exception:
            # networkidle can time-out on heavy pages; fall back to domcontentloaded
            try:
                page.goto(url, wait_until="domcontentloaded", timeout=TIMEOUT_MS)
                time.sleep(3)
            except Exception as exc:
                ctx.close()
                raise RuntimeError(f"Failed to load {url}: {exc}") from exc
        # Extra wait for JS-rendered content
        time.sleep(2)
        html = page.content()
        ctx.close()
        return html
# ---------------------------------------------------------------------------
# Setup mode: log in to X
# ---------------------------------------------------------------------------
def cmd_setup() -> None:
    """Open a visible browser so the user can log in to X.

    The session cookies persist in PROFILE_DIR, so later headless runs
    are already authenticated.
    """
    print("Opening Chromium so you can log in to X/Twitter.")
    print("Once logged in, come back here and press Enter to save the session.")
    with sync_playwright() as pw:
        ctx = pw.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            headless=False,
            viewport={"width": 1280, "height": 900},
            ignore_default_args=["--enable-automation"],
            args=STEALTH_ARGS,
        )
        ctx.add_init_script(STEALTH_INIT_SCRIPT)
        page = ctx.new_page()
        page.goto("https://x.com/login")
        try:
            input("\nPress Enter here once you have logged in… ")
        except EOFError:
            # Non-interactive stdin (e.g. run via a tool) — wait for browser close
            print("(waiting for browser window to be closed…)")
            try:
                page.wait_for_event("close", timeout=300_000)
            except Exception:
                pass
        ctx.close()
        print("Session saved. You can now run: x2dt <url>")
# ---------------------------------------------------------------------------
# Content extraction
# ---------------------------------------------------------------------------
def extract_article(soup: BeautifulSoup, url: str) -> tuple[str, str]:
    """Return (title, content_html) for an X Article.

    Tries progressively broader CSS selectors until one matches; the title
    comes from the first <h1> inside the matched node, falling back to the
    page <title>, then to "X Article".
    """
    selectors = (
        "article",
        '[data-testid*="article"]',
        '[data-testid="tweetDetail"]',
        "main",
    )
    content_node = None
    for css in selectors:
        content_node = soup.select_one(css)
        if content_node:
            break
    if content_node is None:
        raise RuntimeError("Could not find article content in page HTML.")
    heading = content_node.find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        page_title = soup.find("title")
        title = page_title.get_text(strip=True) if page_title else "X Article"
    return title, str(content_node)
def draftjs_to_html(rich_text_node: BeautifulSoup) -> str:
    """Convert DraftJS-rendered HTML (X long-form tweets) to clean semantic HTML.

    Walks each element carrying a ``data-block`` attribute and emits the
    matching semantic tag. Consecutive <li> blocks are buffered so they can
    be wrapped in a single <ul>/<ol>; "section" blocks carry images or code.
    """
    parts = []
    pending_li: list[str] = []  # buffer consecutive list items
    pending_li_ordered: bool = False  # whether current list is ordered

    def flush_list() -> None:
        # Emit buffered list items as one <ul>/<ol> and reset the buffer.
        if pending_li:
            wrap = "ol" if pending_li_ordered else "ul"
            parts.append(f"<{wrap}>\n" + "\n".join(f" <li>{t}</li>" for t in pending_li) + f"\n</{wrap}>")
            pending_li.clear()

    for block in rich_text_node.find_all(attrs={"data-block": True}):
        tag = block.name  # "h1","h2","h3","h4","li","blockquote","section", or "div"
        # Section blocks hold either an image or a code block — no span[data-text] content
        if tag == "section":
            # Image block
            photo = block.find(attrs={"data-testid": "tweetPhoto"})
            if photo:
                img = photo.find("img")
                if img:
                    src = img.get("src", "")
                    if src and "profile_images" not in src:
                        flush_list()
                        parts.append(f'<img src="{src}" alt="">')
                continue
            # Code block
            code_block = block.find(attrs={"data-testid": "markdown-code-block"})
            if code_block:
                code_el = code_block.find("code")
                if code_el:
                    # Escape HTML special chars so the code renders literally.
                    # (Bug fix: the replacements previously mapped each char to
                    # itself, leaving the code un-escaped in the output HTML.)
                    code_text = (code_el.get_text()
                                 .replace("&", "&amp;")
                                 .replace("<", "&lt;")
                                 .replace(">", "&gt;"))
                    flush_list()
                    parts.append(f"<pre><code>{code_text}</code></pre>")
            continue  # nothing more to do for section blocks
        # Text blocks: collect inline spans, mapping bold/italic styles
        inline_parts = []
        for text_span in block.find_all("span", attrs={"data-text": True}):
            text = text_span.get_text()
            parent = text_span.parent
            style = parent.get("style", "") if parent else ""
            if "font-weight: bold" in style or "font-weight:bold" in style:
                inline_parts.append(f"<strong>{text}</strong>")
            elif "font-style: italic" in style or "font-style:italic" in style:
                inline_parts.append(f"<em>{text}</em>")
            else:
                inline_parts.append(text)
        inner = "".join(inline_parts).strip()
        if not inner:
            continue
        if tag in ("h1", "h2", "h3", "h4"):
            flush_list()
            parts.append(f"<{tag}>{inner}</{tag}>")
        elif tag == "li":
            classes = " ".join(block.get("class", []))
            is_ordered = "longform-ordered-list-item" in classes
            # Flush if switching list type
            if pending_li and is_ordered != pending_li_ordered:
                flush_list()
            pending_li_ordered = is_ordered
            pending_li.append(inner)
        elif tag == "blockquote":
            flush_list()
            parts.append(f"<blockquote>{inner}</blockquote>")
        else:
            flush_list()
            parts.append(f"<p>{inner}</p>")
    flush_list()
    return "\n".join(parts)
def resolve_url(url: str) -> str:
    """Follow redirects (e.g. t.co) and return the final destination URL.

    Tries a cheap HEAD first; some hosts reject HEAD, so it falls back to a
    streamed GET (stream=True avoids downloading the body). Returns the
    input URL unchanged if both attempts fail.
    """
    try:
        resp = requests.head(url, allow_redirects=True, timeout=5, headers=HEADERS)
        return resp.url
    except Exception:
        try:
            resp = requests.get(url, allow_redirects=True, timeout=5, headers=HEADERS, stream=True)
            return resp.url
        except Exception:
            return url  # fall back to original if resolution fails
def find_threadreader_url(soup: BeautifulSoup) -> str | None:
    """Return a clean threadreaderapp.com thread URL if one is linked on the page.

    t.co uses JS redirects so we can't follow them with requests.
    Instead, scan link text and aria-labels for the threadreaderapp URL directly.
    """
    pattern = re.compile(r'threadreaderapp\.com/thread/(\d+)')
    anchors = soup.find_all("a", href=True)

    # Pass 1: the href itself already contains a threadreaderapp.com URL.
    for anchor in anchors:
        match = pattern.search(anchor["href"])
        if match:
            return f"https://threadreaderapp.com/thread/{match.group(1)}.html"

    # Pass 2: the URL is only visible in the link text or its aria-label.
    for anchor in anchors:
        haystack = anchor.get_text() + " " + anchor.get("aria-label", "")
        match = pattern.search(haystack)
        if match:
            return f"https://threadreaderapp.com/thread/{match.group(1)}.html"

    return None
def delegate_to_tr2dt(tr_url: str) -> None:
    """Hand off to tr2dt and exit with its return code."""
    helper = Path(__file__).parent / "tr2dt"
    if not helper.exists():
        print(f"Error: tr2dt not found at {helper}", file=sys.stderr)
        sys.exit(1)
    proc = subprocess.run([str(helper), tr_url])
    sys.exit(proc.returncode)
def tweettext_to_html(text_node: BeautifulSoup) -> str:
    """Convert a tweetText node to HTML, resolving t.co links and preserving line breaks.

    Bug fix: the HTML "escaping" previously replaced each special character
    with itself (a no-op), leaving &, < and > un-escaped in the output; they
    are now mapped to their proper character entities.
    """
    parts = []
    for child in text_node.children:
        if isinstance(child, NavigableString):
            # Escape HTML special chars, then preserve line breaks as <br>
            text = (str(child)
                    .replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;"))
            text = text.replace("\n", "<br>")
            parts.append(text)
        elif isinstance(child, Tag):
            if child.name == "a":
                href = child.get("href", "")
                # Skip links to tweet photos or internal X navigation
                if "/photo/" in href or href.startswith("/"):
                    continue
                full_url = resolve_url(href) if "t.co" in href else href
                escaped = full_url.replace("&", "&amp;")
                parts.append(f'<a href="{escaped}">{escaped}</a>')
            elif child.name == "img":
                pass  # emoji images — skip, the alt text is lost anyway
            else:
                # e.g. <span> — get its text, escaped, preserving line breaks
                text = (child.get_text()
                        .replace("&", "&amp;")
                        .replace("<", "&lt;")
                        .replace(">", "&gt;"))
                text = text.replace("\n", "<br>")
                parts.append(text)
    return "".join(parts)
def extract_tweet(soup: BeautifulSoup, url: str) -> tuple[str, str]:
    """Return (title, content_html) for a tweet (short or long-form article).

    Long-form X Articles embedded in a tweet are detected first and routed
    through draftjs_to_html; otherwise the tweetText node and attached
    photos are extracted directly.
    """
    # --- Long-form X Article embedded in a tweet ---
    article_view = soup.select_one('[data-testid="twitterArticleReadView"]')
    if article_view:
        title_node = article_view.select_one('[data-testid="twitter-article-title"]')
        title = title_node.get_text(strip=True) if title_node else ""
        rich_text = article_view.select_one('[data-testid="twitterArticleRichTextView"]')
        body_html = draftjs_to_html(rich_text) if rich_text else ""
        # Cover image: the tweetPhoto that sits OUTSIDE twitterArticleRichTextView
        # (inline images inside the rich text are handled by draftjs_to_html)
        cover_html = ""
        for photo in article_view.find_all(attrs={"data-testid": "tweetPhoto"}):
            if rich_text and rich_text in photo.parents:
                continue  # inline image — draftjs_to_html handles it
            img = photo.find("img")
            if img:
                src = img.get("src", "")
                if src and "profile_images" not in src:
                    cover_html = f'<img src="{src}" alt="">\n'
                    break
        if not title:
            # Fall back to the page <title>, trimming X's " / X" suffix
            title_tag = soup.find("title")
            title = title_tag.get_text(strip=True).split(" / X")[0] if title_tag else "Tweet"
        content_html = cover_html + body_html
        return title, content_html or "<p>(no content found)</p>"
    # --- Regular short tweet ---
    text_node = soup.select_one('[data-testid="tweetText"]')
    tweet_text = text_node.get_text(separator=" ", strip=True) if text_node else ""
    tweet_html = tweettext_to_html(text_node) if text_node else ""
    # Author: User-Name node first, then the username from the URL itself
    author = ""
    author_node = soup.select_one('[data-testid="User-Name"]')
    if author_node:
        author = author_node.get_text(strip=True).split("\n")[0]
    if not author:
        m = re.match(r'https?://(?:x|twitter)\.com/([^/]+)/', url)
        if m:
            author = "@" + m.group(1)
    # Title: "<author> – <first 60 chars of tweet>"
    snippet = tweet_text[:60].rstrip()
    if len(tweet_text) > 60:
        snippet += "…"
    title = f"{author} – {snippet}" if author else snippet or "Tweet"
    parts = []
    if tweet_html:
        parts.append(f"<p>{tweet_html}</p>")
    # Attached photos (profile images excluded)
    for img_node in soup.select('[data-testid="tweetPhoto"] img'):
        src = img_node.get("src", "")
        if src and "profile_images" not in src:
            parts.append(f'<img src="{src}" alt="tweet photo">')
    content_html = "\n".join(parts) if parts else "<p>(no content found)</p>"
    return title, content_html
# ---------------------------------------------------------------------------
# HTML cleaning
# ---------------------------------------------------------------------------
# Tags stripped wholesale before conversion (navigation, scripts, styling).
REMOVE_TAGS = {"nav", "header", "footer", "script", "style", "aside", "noscript"}
# Substrings of data-testid values that mark promoted/ad content.
AD_TESTIDS = {
    "promoted-tweet", "placementTracking", "ad", "inline-card",
}
def clean_html(html: str, base_url: str) -> BeautifulSoup:
    """Remove chrome/ads, make image URLs absolute.

    Returns the parsed (and pruned) BeautifulSoup tree; *base_url* is used
    to absolutize relative <img src> attributes.
    """
    soup = BeautifulSoup(html, "lxml")
    # Remove noisy tags entirely
    for tag in REMOVE_TAGS:
        for node in soup.find_all(tag):
            node.decompose()
    # Remove common ad/promo elements. Compare case-insensitively on BOTH
    # sides — AD_TESTIDS contains mixed-case entries ("placementTracking")
    # that could never match the lowercased testid before this fix.
    for node in soup.find_all(attrs={"data-testid": True}):
        testid = node.get("data-testid", "").lower()
        if any(ad.lower() in testid for ad in AD_TESTIDS):
            node.decompose()
    # Make image src absolute
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if src and not src.startswith("data:"):
            img["src"] = urljoin(base_url, src)
    return soup
# ---------------------------------------------------------------------------
# Image embedding
# ---------------------------------------------------------------------------
# Browser-like headers for image downloads (also reused by resolve_url for
# t.co resolution); the Referer keeps X's CDN from rejecting the request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
    "Referer": "https://x.com/",
}
def embed_images(soup: BeautifulSoup) -> None:
    """Download images and replace src with base64 data URIs (in-place).

    Failures are non-fatal: a warning is printed and the remote URL kept.
    """
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if not src or src.startswith("data:"):
            continue  # nothing to fetch, or already embedded
        try:
            resp = requests.get(src, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            # Strip any ";charset=…" suffix from the MIME type
            content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0]
            b64 = base64.b64encode(resp.content).decode("ascii")
            img["src"] = f"data:{content_type};base64,{b64}"
        except Exception as exc:
            print(f" [warn] Could not embed image {src[:80]}: {exc}", file=sys.stderr)
# ---------------------------------------------------------------------------
# Wrap in full HTML document
# ---------------------------------------------------------------------------
def wrap_html(title: str, body_html: str) -> str:
    """Wrap extracted content in a complete, styled HTML document.

    Bug fix: the title is now HTML-escaped before interpolation into
    <title> and <h1> — tweet text containing '&' or '<' previously
    produced malformed HTML. *body_html* is trusted, already-built HTML.
    """
    safe_title = (title.replace("&", "&amp;")
                  .replace("<", "&lt;")
                  .replace(">", "&gt;"))
    return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
<style>
body {{ font-family: -apple-system, Helvetica, sans-serif; max-width: 800px;
margin: 2em auto; line-height: 1.6; color: #111; }}
img {{ max-width: 100%; height: auto; display: block; margin: 1em 0; }}
h1, h2, h3 {{ font-weight: 600; }}
blockquote {{ border-left: 3px solid #ccc; margin-left: 0; padding-left: 1em;
color: #555; }}
</style>
</head>
<body>
<h1>{safe_title}</h1>
{body_html}
</body>
</html>
"""
# ---------------------------------------------------------------------------
# textutil: HTML → RTFD
# ---------------------------------------------------------------------------
def html_to_rtfd(html: str, title: str) -> Path:
    """Write HTML to a temp file, convert to RTFD via textutil, return RTFD path.

    Raises RuntimeError if textutil fails; in that case the temp HTML file
    is deliberately left in place for inspection.
    """
    import shutil  # local import: only needed for the bundle cleanup below

    safe_title = sanitize_filename(title)
    tmp_html = Path(tempfile.gettempdir()) / "x2dt_content.html"
    rtfd_path = Path(tempfile.gettempdir()) / f"{safe_title}.rtfd"
    # Remove any existing RTFD bundle (textutil won't overwrite).
    # shutil.rmtree replaces the former `rm -rf` subprocess shell-out.
    if rtfd_path.exists():
        shutil.rmtree(rtfd_path)
    tmp_html.write_text(html, encoding="utf-8")
    result = subprocess.run(
        ["textutil", "-convert", "rtfd", "-output", str(rtfd_path), str(tmp_html)],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        print(f"textutil error:\n{result.stderr}", file=sys.stderr)
        print(f"HTML temp file left at: {tmp_html}", file=sys.stderr)
        raise RuntimeError("textutil conversion failed.")
    tmp_html.unlink(missing_ok=True)
    return rtfd_path
# ---------------------------------------------------------------------------
# DEVONthink import via AppleScript
# ---------------------------------------------------------------------------
def import_to_devonthink(rtfd_path: Path, title: str, url: str) -> None:
"""Import RTFD into DEVONthink inbox and set its name and URL."""
escaped_path = str(rtfd_path).replace('"', '\\"')
escaped_title = title.replace('"', '\\"').replace("\\", "\\\\")
escaped_url = url.replace('"', '\\"')
script = f'''
tell application "DEVONthink"
set theRecords to import "{escaped_path}"
if (count of theRecords) > 0 then
set theRecord to item 1 of theRecords
set name of theRecord to "{escaped_title}"
set URL of theRecord to "{escaped_url}"
end if
end tell
'''
result = subprocess.run(
["osascript", "-e", script],
capture_output=True, text=True
)
if result.returncode != 0:
print(f"AppleScript error:\n{result.stderr}", file=sys.stderr)
raise RuntimeError("DEVONthink import failed.")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: parse args, fetch the page, extract, convert, import."""
    parser = argparse.ArgumentParser(
        description="Save X/Twitter articles and tweets to DEVONthink as RTFD.",
        add_help=True,
    )
    parser.add_argument(
        "url", nargs="?",
        help="X/Twitter article or tweet URL"
    )
    parser.add_argument(
        "--setup", action="store_true",
        help="Open browser to log in to X (saves session for future runs)"
    )
    args = parser.parse_args()
    if args.setup:
        cmd_setup()
        return
    if not args.url:
        parser.print_help()
        sys.exit(1)
    url = args.url
    # If the URL itself is a threadreaderapp.com link, delegate immediately
    if "threadreaderapp.com/thread/" in url:
        print(f"Detected: ThreadReader URL → delegating to tr2dt", flush=True)
        delegate_to_tr2dt(url)
    print(f"Fetching: {url}", flush=True)
    # Step A: load page
    try:
        html = load_page(url, headless=True)
    except RuntimeError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    # Step B: detect type and extract
    soup_raw = BeautifulSoup(html, "lxml")
    # Check if the page contains a link to ThreadReader — delegate if so
    tr_url = find_threadreader_url(soup_raw)
    if tr_url:
        print(f"Detected: ThreadReader link → {tr_url}", flush=True)
        delegate_to_tr2dt(tr_url)
    if is_article_url(url):
        print("Detected: X Article")
        try:
            title, content_html = extract_article(soup_raw, url)
        except RuntimeError as exc:
            print(f"Error: {exc}", file=sys.stderr)
            sys.exit(1)
    else:
        # Default: tweet
        print("Detected: Tweet")
        title, content_html = extract_tweet(soup_raw, url)
    if not content_html.strip():
        print("Error: no content extracted from page.", file=sys.stderr)
        sys.exit(1)
    print(f"Title: {title}")
    # Step C: clean HTML
    content_soup = clean_html(content_html, base_url=url)
    # Step D: embed images
    print("Embedding images…")
    embed_images(content_soup)
    # Build final HTML document
    final_html = wrap_html(title, str(content_soup))
    # Step E: convert to RTFD
    print("Converting to RTFD…")
    try:
        rtfd_path = html_to_rtfd(final_html, title)
    except RuntimeError:
        sys.exit(1)
    print(f"RTFD created: {rtfd_path}")
    # Step F+G: import to DEVONthink
    print("Importing to DEVONthink…")
    try:
        import_to_devonthink(rtfd_path, title, url)
    except RuntimeError:
        # Keep the bundle so the user can import it manually
        print(f"RTFD left at: {rtfd_path}", file=sys.stderr)
        sys.exit(1)
    # Step H: cleanup
    subprocess.run(["rm", "-rf", str(rtfd_path)], check=False)
    print(f'Done. \u201c{title}\u201d is in your DEVONthink inbox.')


if __name__ == "__main__":
    main()
Here is the x2dt script; next is a similar script designed to work on Twitter threads via the ThreadReader website.
Note - both scripts were generated by AI - Claude Code - so please use them at your own risk.
#!/Users/user/.x2dt/venv/bin/python3
"""tr2dt — Save a ThreadReader thread to DEVONthink as RTFD.
Usage:
tr2dt <https://threadreaderapp.com/thread/...url>
"""
import argparse
import base64
import re
import subprocess
import sys
import tempfile
from pathlib import Path
from urllib.parse import urlparse, urlunparse
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Browser-like headers for fetching the ThreadReader page itself.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
# Headers for image downloads; the Referer keeps the CDN from rejecting
# requests that look like hot-linking.
IMG_HEADERS = {
    "User-Agent": HEADERS["User-Agent"],
    "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
    "Referer": "https://threadreaderapp.com/",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def sanitize_filename(name: str, max_len: int = 80) -> str:
    """Make *name* safe for use as a filename: swap illegal characters for
    underscores, collapse runs of whitespace, and cap the length."""
    collapsed = re.sub(r'\s+', " ", re.sub(r'[/\\:*?"<>|]', "_", name)).strip()
    return collapsed[:max_len] or "tr2dt_import"
def highres_url(url: str) -> str:
    """Return the original-resolution version of a pbs.twimg.com image URL.

    Any other URL (or an empty string) is returned unchanged.
    """
    if url and "pbs.twimg.com" in url:
        pieces = urlparse(url)
        return urlunparse(pieces._replace(query="name=orig"))
    return url
# ---------------------------------------------------------------------------
# Fetch page
# ---------------------------------------------------------------------------
def fetch_page(url: str) -> str:
    """Fetch the ThreadReader page and return its HTML text.

    Raises requests.HTTPError (via raise_for_status) on non-2xx responses.
    """
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    resp.encoding = "utf-8"  # threadreader serves UTF-8; override any mis-detected charset
    return resp.text
# ---------------------------------------------------------------------------
# Extract thread content
# ---------------------------------------------------------------------------
def tweet_to_html(div: Tag) -> str:
    """Convert a single content-tweet div to a clean HTML fragment.

    Bug fix: the HTML "escaping" previously replaced each special character
    with itself (a no-op), leaving &, < and > un-escaped; text is now mapped
    to proper character entities via the local esc() helper.
    """
    def esc(text: str) -> str:
        # Escape HTML special chars so text renders literally.
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    parts = []
    for child in div.children:
        if isinstance(child, NavigableString):
            parts.append(esc(str(child)))
            continue
        if not isinstance(child, Tag):
            continue
        tag = child.name
        classes = " ".join(child.get("class", []))
        # Permalink icon — skip
        if tag == "sup" and "tw-permalink" in classes:
            continue
        # Image: prefer the full-size link target over the lazy-load data-src
        if tag == "span" and "entity-image" in classes:
            a = child.find("a")
            img = child.find("img")
            if img:
                raw_src = (a.get("href") if a else None) or img.get("data-src", "")
                src = highres_url(raw_src)
                if src:
                    parts.append(f'\n<img src="{src}" alt="">\n')
            continue
        # Embedded tweet (blockquote) — skip
        if tag == "span" and "entity-embed" in classes:
            continue
        # Hyperlink
        if tag == "a" and "entity-url" in classes:
            href = child.get("href", "")
            display = child.get_text(strip=True) or href
            parts.append(f'<a href="{esc(href)}">{esc(display)}</a>')
            continue
        # Line break
        if tag == "br":
            parts.append("<br>")
            continue
        # Everything else (spans, nop markers, etc.) — extract text
        parts.append(esc(child.get_text()))
    return "".join(parts).strip()
def extract_thread(soup: BeautifulSoup, source_url: str) -> tuple[str, str, str]:
    """Return (title, original_tweet_url, content_html).

    Raises RuntimeError when no numbered tweet blocks can be found.
    """
    # Original tweet URL from thread-info
    original_url = source_url
    ti = soup.find("div", class_="thread-info")
    if ti:
        time_link = ti.find("a", class_="time")
        if time_link and time_link.get("href"):
            original_url = time_link["href"]
    # Author from first tweet's data-screenname
    first = soup.find("div", id="tweet_1")
    author = f'@{first["data-screenname"]}' if first and first.get("data-screenname") else ""
    # Title: og:title meta first, then construct from author + first tweet text
    og = soup.find("meta", property="og:title")
    if og and og.get("content") and "Thread Reader" not in og["content"]:
        title = og["content"].strip()
    elif first:
        snippet = first.get_text(separator=" ", strip=True)[:80].rstrip()
        if len(first.get_text(strip=True)) > 80:
            snippet += "…"
        title = f"{author} – {snippet}" if author else snippet
    else:
        title = f"Thread by {author}" if author else "Thread"
    # Collect only numbered tweet divs (tweet_1 … tweet_N)
    numbered = soup.find_all(
        "div",
        id=re.compile(r'^tweet_\d+$'),
        class_="content-tweet"
    )
    if not numbered:
        raise RuntimeError("No numbered tweet blocks found in page.")
    # Build HTML: one <div class="tweet-block"> per tweet, separated by <hr>
    blocks = []
    for div in numbered:
        html_fragment = tweet_to_html(div)
        if html_fragment:
            blocks.append(f'<div class="tweet-block">{html_fragment}</div>')
    content_html = "\n<hr>\n".join(blocks)
    return title, original_url, content_html
# ---------------------------------------------------------------------------
# Image embedding
# ---------------------------------------------------------------------------
def embed_images(soup: BeautifulSoup) -> None:
    """Download images and replace src with base64 data URIs (in-place).

    Failures are non-fatal: a warning is printed and the remote URL kept.
    """
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if not src or src.startswith("data:"):
            continue  # nothing to fetch, or already embedded
        try:
            resp = requests.get(src, headers=IMG_HEADERS, timeout=15)
            resp.raise_for_status()
            # Strip any ";charset=…" suffix from the MIME type
            content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
            b64 = base64.b64encode(resp.content).decode("ascii")
            img["src"] = f"data:{content_type};base64,{b64}"
        except Exception as exc:
            print(f" [warn] Could not embed image {src[:80]}: {exc}", file=sys.stderr)
# ---------------------------------------------------------------------------
# HTML document wrapper
# ---------------------------------------------------------------------------
def wrap_html(title: str, body_html: str) -> str:
    """Wrap the extracted thread in a complete, styled HTML document.

    Bug fix: the title is now HTML-escaped before interpolation into
    <title> and <h1> — tweet text containing '&' or '<' previously
    produced malformed HTML. *body_html* is trusted, already-built HTML.
    """
    safe_title = (title.replace("&", "&amp;")
                  .replace("<", "&lt;")
                  .replace(">", "&gt;"))
    return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
<style>
body {{ font-family: -apple-system, Helvetica, sans-serif; max-width: 800px;
margin: 2em auto; line-height: 1.6; color: #111; }}
img {{ max-width: 100%; height: auto; display: block; margin: 1em 0; }}
hr {{ border: none; border-top: 1px solid #ddd; margin: 1.5em 0; }}
.tweet-block {{ margin-bottom: 0.5em; }}
a {{ color: #1d9bf0; }}
blockquote {{ border-left: 3px solid #ccc; margin-left: 0;
padding-left: 1em; color: #555; }}
</style>
</head>
<body>
<h1>{safe_title}</h1>
{body_html}
</body>
</html>
"""
# ---------------------------------------------------------------------------
# textutil: HTML → RTFD
# ---------------------------------------------------------------------------
def html_to_rtfd(html: str, title: str) -> Path:
    """Write HTML to a temp file, convert to RTFD via textutil, return RTFD path.

    Raises RuntimeError if textutil fails; in that case the temp HTML file
    is deliberately left in place for inspection.
    """
    import shutil  # local import: only needed for the bundle cleanup below

    safe_title = sanitize_filename(title)
    tmp_html = Path(tempfile.gettempdir()) / "tr2dt_content.html"
    rtfd_path = Path(tempfile.gettempdir()) / f"{safe_title}.rtfd"
    # Remove any existing RTFD bundle (textutil won't overwrite).
    # shutil.rmtree replaces the former `rm -rf` subprocess shell-out.
    if rtfd_path.exists():
        shutil.rmtree(rtfd_path)
    tmp_html.write_text(html, encoding="utf-8")
    result = subprocess.run(
        ["textutil", "-convert", "rtfd", "-output", str(rtfd_path), str(tmp_html)],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        print(f"textutil error:\n{result.stderr}", file=sys.stderr)
        print(f"HTML temp file left at: {tmp_html}", file=sys.stderr)
        raise RuntimeError("textutil conversion failed.")
    tmp_html.unlink(missing_ok=True)
    return rtfd_path
# ---------------------------------------------------------------------------
# DEVONthink import
# ---------------------------------------------------------------------------
def import_to_devonthink(rtfd_path: Path, title: str, url: str) -> None:
escaped_path = str(rtfd_path).replace('"', '\\"')
escaped_title = title.replace('"', '\\"').replace("\\", "\\\\")
escaped_url = url.replace('"', '\\"')
script = f'''
tell application "DEVONthink"
set theRecords to import "{escaped_path}"
if (count of theRecords) > 0 then
set theRecord to item 1 of theRecords
set name of theRecord to "{escaped_title}"
set URL of theRecord to "{escaped_url}"
end if
end tell
'''
result = subprocess.run(
["osascript", "-e", script],
capture_output=True, text=True
)
if result.returncode != 0:
print(f"AppleScript error:\n{result.stderr}", file=sys.stderr)
raise RuntimeError("DEVONthink import failed.")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: fetch the ThreadReader page, extract the thread,
    embed images, convert to RTFD, and import into DEVONthink."""
    parser = argparse.ArgumentParser(
        description="Save a ThreadReader thread to DEVONthink as RTFD."
    )
    parser.add_argument("url", help="ThreadReader URL (threadreaderapp.com/thread/...)")
    args = parser.parse_args()
    url = args.url
    if "threadreaderapp.com" not in url:
        print("Error: URL must be a threadreaderapp.com URL.", file=sys.stderr)
        sys.exit(1)
    print(f"Fetching: {url}")
    try:
        html = fetch_page(url)
    except Exception as exc:
        print(f"Error fetching page: {exc}", file=sys.stderr)
        sys.exit(1)
    soup = BeautifulSoup(html, "lxml")
    print("Extracting thread…")
    try:
        title, original_url, content_html = extract_thread(soup, url)
    except RuntimeError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)
    print(f"Title: {title}")
    # Parse content HTML so we can embed images
    content_soup = BeautifulSoup(content_html, "lxml")
    print("Downloading images…")
    embed_images(content_soup)
    final_html = wrap_html(title, str(content_soup))
    print("Converting to RTFD…")
    try:
        rtfd_path = html_to_rtfd(final_html, title)
    except RuntimeError:
        sys.exit(1)
    print(f"RTFD created: {rtfd_path}")
    print("Importing to DEVONthink…")
    try:
        # The record's URL points back at the ORIGINAL tweet, not ThreadReader
        import_to_devonthink(rtfd_path, title, original_url)
    except RuntimeError:
        # Keep the bundle so the user can import it manually
        print(f"RTFD left at: {rtfd_path}", file=sys.stderr)
        sys.exit(1)
    subprocess.run(["rm", "-rf", str(rtfd_path)], check=False)
    print(f'Done. \u201c{title}\u201d is in your DEVONthink inbox.')


if __name__ == "__main__":
    main()