sync(medium): strip stat tracker and resolve gist embeds
author xangelo <me@xangelo.ca>
Tue, 22 Jul 2025 03:43:53 +0000 (23:43 -0400)
committer xangelo <me@xangelo.ca>
Tue, 22 Jul 2025 03:43:53 +0000 (23:43 -0400)
.github/scripts/medium_to_hugo.py

index 6840df66a1f2cdc943ebaf1e75d11cbbe1ba29b1..9e0b1f33d850faf8ba60848e5de7f1548fd5e791 100644 (file)
@@ -5,6 +5,8 @@ import frontmatter
 from markdownify import markdownify as md
 import html
 from datetime import datetime
+import requests
+from urllib.parse import urlparse
 
 RSS_URL = "https://medium.com/feed/@xangelo"
 OUTPUT_DIR = "content/posts/medium"
@@ -13,6 +15,36 @@ EXISTING_SLUGS = {f[:-3] for f in os.listdir(OUTPUT_DIR) if f.endswith(".md")}
 def slugify(title):
     return re.sub(r"[^\w-]", "", re.sub(r"\s+", "-", title.lower())).strip("-")
 
+def resolve_medium_media_links(content):
+    """
+    Replace Medium media links in the content with GitHub Gist embeds where applicable.
+    Medium media links look like: <https://medium.com/media/c7634bd7099d8b4a3c68e75789d29869/href>
+    Each link is fetched (10s timeout) to follow its redirect; links that fail to resolve or
+    that do not land on gist.github.com are left unchanged. Returns the rewritten content.
+    """
+    medium_media_pattern = r'<(https://medium\.com/media/[a-f0-9]+/href)>'
+
+    def replace_link(match):
+        medium_url = match.group(1)
+        try:
+            # Follow the Medium media link to see where it redirects
+            response = requests.get(medium_url, allow_redirects=True, timeout=10)
+        except requests.RequestException as e:
+            print(f"Failed to resolve Medium media link {medium_url}: {e}")
+            return match.group(0)  # Return original on error
+        final_url = response.url
+        parsed_url = urlparse(final_url)
+        if parsed_url.netloc != 'gist.github.com':
+            print(f"Medium media link does not resolve to GitHub Gist: {medium_url} -> {final_url}")
+            return match.group(0)  # Return original if not a gist
+        # A gist only embeds via its ".js" URL; the plain page URL serves HTML, not a script
+        path = parsed_url.path.rstrip('/')
+        embed_url = parsed_url._replace(path=path if path.endswith('.js') else path + '.js', fragment='').geturl()
+        print(f"Resolved Medium media link: {medium_url} -> {embed_url}")
+        return f"<script src=\"{embed_url}\"></script>"
+
+    return re.sub(medium_media_pattern, replace_link, content)
+
 feed = feedparser.parse(RSS_URL)
 
 for entry in feed.entries:
@@ -30,6 +62,14 @@ for entry in feed.entries:
     post["draft"] = False
     post["medium_link"] = entry.link
 
+    # Resolve Medium media links to GitHub Gists
+    post["content"] = resolve_medium_media_links(post["content"])
+
+    # the last line of a post is a Medium stat-tracking pixel; strip it so that views of
+    # this site don't count towards the viewer count on medium. It looks like this, with a
+    # different postId for every post: ![](https://medium.com/_/stat?event=post.clientViewed&referrerSource=full_rss&postId=<id>)
+    post["content"] = re.sub(r"!\[\]\(https://medium\.com/_/stat\?[^)\s]*\)", "", post["content"])
+
     # add a line to the bottom of the post to indicate that it's from Medium
     post["content"] += "\n\n---\n\nThis was originally published on Medium - " + entry.link