Python script to automate static Hugo site post images

It’s good to have a general post image on content sites, as it gives more of an identity to each page, and works well for social media sharing images.

While NotesToSelf.dev doesn’t tend to have images, my Chinese learning site and satirical tech news site have an image for each post.

Unsplash is a nice source of general royalty-free images that you can use for this if you give attribution.

When using Unsplash, I put the URL of the Unsplash page (not of the image itself) in the front matter of each post:

---
title: Foobar
unsplash_url: https://unsplash.com/photos/gKXKBY-C-Dk
---

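Then I have a Python script that runs through the post files, checks each one for the unsplash_url field, and does the following:

- Skips the post if it already has an image field.
- Fetches the Unsplash page and extracts the photographer's name and the download link.
- Downloads the image (768px wide) into ./static/img/<year>/<month>/, named after the photographer and the image ID.
- Sets the image field in the post's front matter.
- Appends a figure shortcode with the attribution to the post content.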

After running the script, the image file is at ./static/img/2020/02/manja-vitolic-gKXKBY-C-Dk-unsplash.jpg in the repo, and the post file looks like this:

---
title: Foobar
unsplash_url: https://unsplash.com/photos/gKXKBY-C-Dk
image: /img/2020/02/manja-vitolic-gKXKBY-C-Dk-unsplash.jpg
---

{{< figure src="/img/2020/02/manja-vitolic-gKXKBY-C-Dk-unsplash.jpg"
           attr="Photo by Manja Vitolic on Unsplash"
           attrlink="https://unsplash.com/photos/gKXKBY-C-Dk" >}}

That uses Hugo’s built-in figure shortcode to render the image as an HTML5 <figure> element with an attribution link back to Unsplash.

The script looks like this:

#!/usr/bin/python3

import argparse
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from bs4.element import PageElement
import shutil
import re
import os
from datetime import date
import pathlib
import frontmatter
import glob
from multiprocessing import Pool


# Directory containing this script. Every repo path used below is built by
# joining onto it with a ".." component ("..", "../static", "../content/articles").
dirname = os.path.dirname(__file__)


def set_image_for_post(post_path: str):
    """Download the Unsplash image referenced by a post and wire it into the post.

    The post's front matter must contain ``unsplash_url`` (the Unsplash photo
    page) and ``date``. On success the image is saved under the static
    directory, the ``image`` front matter field is set to its server path, and
    a ``figure`` shortcode with attribution is appended to the content (unless
    the identical shortcode is already present).

    Posts that already have an ``image`` field are left untouched; posts
    without ``unsplash_url`` are reported on stdout and skipped.

    :param post_path: path to the post markdown file (see sanitise_post_path).
    :raises RuntimeError: if the post file does not exist, or the Unsplash page
        cannot be fetched or parsed, or the image download fails.
    :raises ValueError: if ``unsplash_url`` does not point at unsplash.com.
    :raises KeyError: if the post has no ``date`` front matter field.
    """
    post_path = sanitise_post_path(post_path)
    post = frontmatter.load(post_path)

    # Already processed: never re-download or rewrite the post.
    if "image" in post.metadata:
        return

    if "unsplash_url" not in post.metadata:
        print("No unsplash_url set on post", post_path)
        return

    unsplash_url = str(post["unsplash_url"]).strip()

    html = unsplash_image_page_html(unsplash_url)
    soup = BeautifulSoup(html, "html.parser")

    author_name = extract_author(soup)
    print({"author_name": author_name})

    image_id = extract_image_id(unsplash_url)
    print({"image_id": image_id})

    image_server_path = make_image_server_path(post.metadata["date"], author_name, image_id)
    print({"image_server_path": image_server_path})

    image_local_path = make_image_local_path(image_server_path)
    print({"image_local_path": image_local_path})

    download_url = extract_download_url(soup)
    print({"download_url": download_url})

    # Download before touching the post so a failed download leaves it unchanged.
    download_image_file(download_url, image_local_path)

    post.metadata["image"] = image_server_path

    image_shortcode = make_image_shortcode(image_server_path, author_name, unsplash_url)
    if image_shortcode not in post.content:
        post.content += f"\n{image_shortcode}\n"

    # frontmatter.load() decodes the file as UTF-8 by default, so write it back
    # with the same encoding instead of the platform default; otherwise
    # non-ASCII posts can be corrupted or fail to encode on some systems.
    with open(post_path, "w", encoding="utf-8") as f:
        f.write(frontmatter.dumps(post))


def sanitise_post_path(post_path: str) -> str:
    """Resolve *post_path* to the absolute path of an existing post file.

    The path is first interpreted relative to the parent of this script's
    directory, with leading/trailing slashes stripped. If that is not a file,
    the path is tried exactly as given (absolute, or relative to the current
    working directory).

    The second attempt matters for the paths produced by the glob in the
    ``__main__`` block: when ``dirname`` is absolute (as it is for a script run
    with Python 3.9+), those paths are absolute too, and stripping their
    leading slash and re-joining them under the repo root yields a path that
    does not exist.

    :param post_path: repo-relative, cwd-relative or absolute path to a post.
    :returns: absolute path of the post file.
    :raises RuntimeError: if neither interpretation names an existing file.
    """
    raw_path = str(post_path)
    repo_relative = os.path.abspath(
        os.path.join(dirname, "..", raw_path.strip("/"))
    )
    # Repo-relative first, so every path that resolved before still resolves
    # to the same file.
    for candidate in (repo_relative, os.path.abspath(raw_path)):
        if os.path.isfile(candidate):
            return candidate
    raise RuntimeError("Post path does not exist", repo_relative)


def unsplash_image_page_html(unsplash_url: str, timeout: float = 30.0) -> str:
    """Fetch the HTML of an Unsplash photo page.

    :param unsplash_url: URL of the photo page; must be on unsplash.com.
    :param timeout: seconds to wait for the server before giving up. Without
        a timeout ``requests`` waits indefinitely, which would stall a whole
        batch run on one unresponsive connection.
    :returns: the response body as text.
    :raises ValueError: if the URL is not an unsplash.com URL.
    :raises RuntimeError: if the response status is not 200.
    """
    validate_unsplash_url(unsplash_url)
    response = requests.get(unsplash_url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError("Non-200 response", unsplash_url, response.text)
    return response.text


def validate_unsplash_url(url: str):
    """Raise ``ValueError`` unless the host of *url* is exactly ``unsplash.com``.

    Subdomains (e.g. ``www.unsplash.com``) and URLs without a host are rejected.
    Returns ``None`` when the URL is accepted.
    """
    host = urlparse(url).hostname
    if host == "unsplash.com":
        return
    raise ValueError("Non-Unsplash url", url)


def extract_author(soup: BeautifulSoup) -> str:
    """Return the photographer's display name from a parsed Unsplash photo page.

    The name is the stripped text of the first link matching
    ``div>span>a[href^='/@']``.

    :raises RuntimeError: if no such link is found in the page.
    """
    profile_link = soup.select_one("div>span>a[href^='/@']")  # type: PageElement
    if profile_link:
        # noinspection PyUnresolvedReferences
        return str(profile_link.text).strip()
    raise RuntimeError("Failed to find author element")


def extract_image_id(unsplash_url: str) -> str:
    """Return the photo id from an ``unsplash.com/photos/<id>`` URL.

    The id is the run of letters, digits, ``-`` and ``_`` that follows
    ``/photos/``; anything after it (query string, further path) is ignored.

    :raises RuntimeError: if the URL does not contain ``unsplash.com/photos/<id>``.
    """
    # "_" has to be accepted alongside "-": without it an id containing an
    # underscore is silently cut short at the underscore.
    match = re.search(r"unsplash\.com/photos/([a-zA-Z0-9_-]+)", unsplash_url)
    if not match:
        raise RuntimeError("Failed to extract image id from", unsplash_url)
    return match.group(1)


def extract_download_url(soup: BeautifulSoup) -> str:
    """Return the image download URL from a parsed Unsplash photo page.

    Takes the ``href`` of the first ``<a>`` that has a ``download`` attribute
    and whose ``href`` contains "download", and adds the query parameter
    ``w=768`` to it.

    :raises RuntimeError: if no download link is found in the page.
    """
    download_link = soup.select_one("a[href*=download][download]")
    if download_link:
        prepared = requests.PreparedRequest()
        # noinspection PyUnresolvedReferences
        href = str(download_link["href"]).strip()
        # prepare_url merges the extra query parameter into the existing URL.
        prepared.prepare_url(href, {"w": "768"})
        return prepared.url
    raise RuntimeError("Failed to find download element")


def make_image_server_path(post_date: date, author_name: str, image_id: str) -> str:
    """Build the site-absolute path ``/img/<yyyy>/<mm>/<author>-<id>-unsplash.jpg``.

    The author part is the stripped, lower-cased name with every run of
    characters outside ``a-z0-9`` collapsed to a single hyphen.
    """
    normalised_name = author_name.strip().lower()
    author_slug = re.sub(r"[^a-z0-9]+", "-", normalised_name)
    directory = f"/img/{post_date.year}/{post_date.strftime('%m')}"
    file_name = f"{author_slug}-{image_id}-unsplash.jpg"
    return f"{directory}/{file_name}"


def make_image_local_path(image_server_path: str) -> str:
    """Map a server image path to its file path under ``../static`` and
    create the containing directory (and any missing parents) if needed.

    :returns: the file path, built by joining onto ``dirname``.
    """
    relative_path = image_server_path.strip("/")
    file_path = os.path.join(dirname, "../static", relative_path)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    return file_path


def download_image_file(unsplash_url: str, file_path: str, timeout: float = 30.0):
    """Stream the image at *unsplash_url* into *file_path*.

    :param unsplash_url: direct download URL of the image.
    :param file_path: destination file; overwritten if it exists.
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot hang the run indefinitely.
    :raises RuntimeError: if the response status is not 200. The destination
        file is not created or truncated in that case.
    """
    # With stream=True the connection stays open until the response is closed,
    # so use it as a context manager to release it on every path.
    with requests.get(unsplash_url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            raise RuntimeError(
                "Non-200 response from image download request",
                unsplash_url,
                r.status_code,
            )
        # Have urllib3 undo any gzip/deflate transfer encoding while copying.
        r.raw.decode_content = True
        with open(file_path, "wb") as f:
            shutil.copyfileobj(r.raw, f)


def make_image_shortcode(image_server_path: str, author_name: str, unsplash_url: str) -> str:
    """Return a three-line Hugo ``figure`` shortcode for the image.

    ``src`` is the image path, ``attr`` credits the photographer and
    ``attrlink`` points at the Unsplash page. The values are inserted as-is
    (no escaping) and the result has no leading or trailing newline.
    """
    # Continuation lines are indented so the attributes line up under "src".
    shortcode_lines = (
        f'{{{{< figure src="{image_server_path}"',
        f'           attr="Photo by {author_name} on Unsplash"',
        f'           attrlink="{unsplash_url}" >}}}}',
    )
    return "\n".join(shortcode_lines)


if __name__ == "__main__":
    # With no argument (or "*") every article is processed, eight at a time;
    # otherwise only the given post is processed.
    cli = argparse.ArgumentParser(description="Fetch and set up the image on a post.")
    cli.add_argument(
        "post_path",
        nargs="?",
        default="*",
        type=str,
        help="path to the post markdown file",
    )
    requested_path = cli.parse_args().post_path
    if requested_path and requested_path != "*":
        set_image_for_post(requested_path)
    else:
        articles_pattern = os.path.join(dirname, "../content/articles/**/*.md")
        all_posts = glob.glob(articles_pattern, recursive=True)
        with Pool(8) as workers:
            workers.map(set_image_for_post, all_posts)

Tech mentioned