It’s good to have a general image for each post on a content site: it gives
each page more of an identity, and it works well as the
social media sharing image.
While NotesToSelf.dev doesn’t tend to have images, my Chinese
learning site and satirical tech
news site have an image for each post.
Unsplash is a nice source of general royalty-free
images that you can use for this if you give attribution.
When using Unsplash, I put the URL of the Unsplash page (not of the image
itself) in the frontmatter of each post:
---
title: Foobar
unsplash_url: https://unsplash.com/photos/gKXKBY-C-Dk
---
Then I have a Python script that runs through the post files checking for the
unsplash_url field and doing the following:
- Parse the Unsplash page to get the image author’s name.
- Download the image at the desired size.
- Make a reasonable filename for the image and put it in the static directory.
- Set the image’s final server path on the post frontmatter for social markup.
- Put the image in the post body as a shortcode with an attribution link.
After running the script, the image file is at
./static/img/2020/02/manja-vitolic-gKXKBY-C-Dk-unsplash.jpg in the repo, and
the post file looks like this:
---
title: Foobar
unsplash_url: https://unsplash.com/photos/gKXKBY-C-Dk
image: /img/2020/02/manja-vitolic-gKXKBY-C-Dk-unsplash.jpg
---
{{< figure src="/img/2020/02/manja-vitolic-gKXKBY-C-Dk-unsplash.jpg"
attr="Photo by Manja Vitolic on Unsplash"
attrlink="https://unsplash.com/photos/gKXKBY-C-Dk" >}}
That uses Hugo’s built-in figure
shortcode to render
the image as an HTML5 <figure> element with an attribution link back to
Unsplash.
The script looks like this:
#!/usr/bin/python3
import argparse
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from bs4.element import PageElement
import shutil
import re
import os
from datetime import date
import pathlib
import frontmatter
import glob
from multiprocessing import Pool
# Directory containing this script; other paths in the file are built relative to it.
dirname = os.path.split(__file__)[0]
def set_image_for_post(post_path: str):
    """Download the Unsplash image for one post and record it in the post file.

    Steps, in order:
      1. Resolve ``post_path`` and load its frontmatter.
      2. Return early if the post already has an ``image`` field, or (after
         printing a notice) if it has no ``unsplash_url`` field.
      3. Fetch the Unsplash page and extract the author name, the image id
         and the download url.
      4. Download the image file into the static directory.
      5. Set ``image`` in the frontmatter, append the figure shortcode to the
         body if it is not already there, and rewrite the post file.

    :param post_path: path to the post markdown file.
    :raises KeyError: if the post has no ``date`` field in its frontmatter.
    :raises RuntimeError: propagated from the helpers when the post file, the
        author element, the image id or the download link cannot be found, or
        when an HTTP request returns a non-200 status.
    :raises ValueError: if ``unsplash_url`` is not on ``unsplash.com``.
    """
    post_path = sanitise_post_path(post_path)
    post = frontmatter.load(post_path)

    if "image" in post.metadata:
        # Already processed: leave the post untouched.
        return
    if "unsplash_url" not in post.metadata:
        print("No unsplash_url set on post", post_path)
        return

    unsplash_url = str(post["unsplash_url"]).strip()
    html = unsplash_image_page_html(unsplash_url)
    soup = BeautifulSoup(html, "html.parser")

    author_name = extract_author(soup)
    print({"author_name": author_name})
    image_id = extract_image_id(unsplash_url)
    print({"image_id": image_id})
    image_server_path = make_image_server_path(post.metadata["date"], author_name, image_id)
    print({"image_server_path": image_server_path})
    image_local_path = make_image_local_path(image_server_path)
    print({"image_local_path": image_local_path})
    download_url = extract_download_url(soup)
    print({"download_url": download_url})

    download_image_file(download_url, image_local_path)

    post.metadata["image"] = image_server_path
    image_shortcode = make_image_shortcode(image_server_path, author_name, unsplash_url)
    if image_shortcode not in post.content:
        post.content += f"\n{image_shortcode}\n"

    # Write explicitly as UTF-8 so non-ASCII post content does not depend on
    # the platform's default locale encoding.
    with open(post_path, "w", encoding="utf-8") as f:
        f.write(frontmatter.dumps(post))
def sanitise_post_path(post_path: str) -> str:
    """Resolve ``post_path`` to the absolute path of an existing post file.

    An absolute path that already points at an existing file is returned
    (normalised) as-is. Anything else is interpreted relative to the parent
    of this script's directory, with leading/trailing slashes stripped.

    :param post_path: absolute path, or path relative to the repository root.
    :return: absolute, normalised path to the post file.
    :raises RuntimeError: if the resolved path is not an existing file.
    """
    raw_path = str(post_path)

    # When `dirname` is absolute (e.g. `__file__` of a script on Python 3.9+),
    # globbed post paths are absolute too. Stripping their leading slash and
    # re-rooting them under the repository would produce a non-existent path.
    if os.path.isabs(raw_path) and os.path.isfile(raw_path):
        return os.path.abspath(raw_path)

    resolved = os.path.abspath(
        os.path.join(dirname, "..", raw_path.strip("/"))
    )
    if not os.path.isfile(resolved):
        raise RuntimeError("Post path does not exist", resolved)
    return resolved
def unsplash_image_page_html(unsplash_url: str) -> str:
    """Fetch the HTML of an Unsplash photo page.

    :param unsplash_url: url of the photo page; must be on ``unsplash.com``.
    :return: the response body as text.
    :raises ValueError: if the url is not an Unsplash url.
    :raises RuntimeError: if the response status is not 200.
    """
    validate_unsplash_url(unsplash_url)
    # Without a timeout a stalled connection would block the worker forever.
    response = requests.get(unsplash_url, timeout=30)
    if response.status_code != 200:
        raise RuntimeError("Non-200 response", unsplash_url, response.text)
    return response.text
def validate_unsplash_url(url: str):
    """Raise ``ValueError`` unless the hostname of ``url`` is exactly ``unsplash.com``."""
    host = urlparse(url).hostname
    if host == "unsplash.com":
        return
    raise ValueError("Non-Unsplash url", url)
def extract_author(soup: BeautifulSoup) -> str:
    """Return the stripped text of the author link found in the parsed page.

    The author link is the first element matching
    ``div>span>a[href^='/@']``.

    :raises RuntimeError: if no such element is present.
    """
    author_link = soup.select_one("div>span>a[href^='/@']")  # type: PageElement
    if author_link:
        # noinspection PyUnresolvedReferences
        return str(author_link.text).strip()
    raise RuntimeError("Failed to find author element")
def extract_image_id(unsplash_url: str) -> str:
    """Extract the photo id from an ``unsplash.com/photos/<id>`` url.

    The id is the run of letters, digits, ``-`` and ``_`` that follows
    ``/photos/``; anything after it (query string, further path segments)
    is ignored.

    :param unsplash_url: url of the Unsplash photo page.
    :return: the photo id.
    :raises RuntimeError: if the url does not contain a photo id.
    """
    # `_` is included so that ids containing underscores are neither
    # truncated at the underscore nor rejected when they start with one.
    match = re.search(r"unsplash\.com/photos/([a-zA-Z0-9_-]+)", unsplash_url)
    if not match or not match.group(1):
        raise RuntimeError("Failed to extract image id from", unsplash_url)
    return match.group(1)
def extract_download_url(soup: BeautifulSoup, width: int = 768) -> str:
    """Build the image download url from the parsed Unsplash page.

    Finds the first ``<a>`` element that has a ``download`` attribute and
    whose ``href`` contains ``download``, then adds a ``w`` query parameter
    carrying the requested width.

    :param soup: parsed Unsplash photo page.
    :param width: value sent as the ``w`` query parameter (default 768,
        the value that was previously hard-coded).
    :return: the download url including the ``w`` parameter.
    :raises ValueError: if ``width`` is not a positive number.
    :raises RuntimeError: if no download link is found on the page.
    """
    if width < 1:
        raise ValueError("Image width must be positive", width)

    download_element = soup.select_one("a[href*=download][download]")
    if not download_element:
        raise RuntimeError("Failed to find download element")

    # PreparedRequest is used only as a url builder: it merges the extra
    # parameter into whatever query string the href already has.
    req = requests.PreparedRequest()
    # noinspection PyUnresolvedReferences
    req.prepare_url(str(download_element["href"]).strip(), {"w": str(width)})
    return req.url
def make_image_server_path(post_date: date, author_name: str, image_id: str) -> str:
    """Build the site-relative path under which the image is served.

    The result has the form
    ``/img/<year>/<month>/<author-slug>-<image_id>-unsplash.jpg``, with the
    month zero-padded to two digits.

    The author slug is lower-case ASCII letters and digits separated by
    single dashes. Accented letters are reduced to their base letter, and
    the slug never starts or ends with a dash. If nothing usable remains
    (e.g. a name written entirely in a non-Latin script) the slug and its
    separator are omitted.

    :param post_date: date of the post; only year and month are used.
    :param author_name: display name of the image author.
    :param image_id: Unsplash photo id.
    :return: the server path, starting with ``/img/``.
    """
    import unicodedata

    # Decompose accented characters and drop the combining marks so that
    # e.g. "ć" becomes "c" instead of turning into a dash.
    ascii_name = (
        unicodedata.normalize("NFKD", author_name)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    # Strip dashes so names with leading/trailing punctuation do not yield
    # "--" (or a leading "-") in the filename.
    author_slug = re.sub(r"[^a-z0-9]+", "-", ascii_name.strip().lower()).strip("-")
    prefix = f"{author_slug}-" if author_slug else ""
    return f"/img/{post_date.year}/{post_date.strftime('%m')}/{prefix}{image_id}-unsplash.jpg"
def make_image_local_path(image_server_path: str) -> str:
    """Map a server path to its file path under ``../static`` and ensure its directory exists.

    The server path has its leading/trailing slashes stripped and is joined
    onto ``<dirname>/../static``. The parent directory of the resulting file
    path is created (including intermediate directories) if missing.

    :return: the file path the image should be written to.
    """
    relative_part = image_server_path.strip("/")
    file_path = os.path.join(dirname, "../static", relative_part)
    parent_dir = pathlib.Path(os.path.dirname(file_path))
    parent_dir.mkdir(parents=True, exist_ok=True)
    return file_path
def download_image_file(unsplash_url: str, file_path: str):
    """Stream the image at ``unsplash_url`` into ``file_path``.

    :param unsplash_url: url to download the image from.
    :param file_path: destination file; opened in binary write mode.
    :raises RuntimeError: if the response status is not 200 (the destination
        file is not opened in that case).
    """
    # The context manager releases the streamed connection even on errors;
    # the (connect, read) timeout keeps a stalled download from hanging.
    with requests.get(unsplash_url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            raise RuntimeError("Non-200 response from image download request")
        with open(file_path, "wb") as f:
            # Have urllib3 undo any gzip/deflate transfer encoding while
            # the raw stream is copied.
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)
def make_image_shortcode(image_server_path: str, author_name: str, unsplash_url: str) -> str:
    """Return the three-line Hugo ``figure`` shortcode for the image.

    The shortcode carries the image source, a "Photo by ... on Unsplash"
    attribution and a link back to ``unsplash_url``.
    """
    shortcode_lines = (
        f'{{{{< figure src="{image_server_path}"',
        f'attr="Photo by {author_name} on Unsplash"',
        f'attrlink="{unsplash_url}" >}}}}',
    )
    return "\n".join(shortcode_lines)
if __name__ == "__main__":
    # Command-line entry point. With no argument (or "*") every markdown file
    # under content/articles is processed in a pool of 8 worker processes;
    # otherwise only the given post is processed.
    parser = argparse.ArgumentParser(description="Fetch and set up the image on a post.")
    parser.add_argument("post_path", type=str, help="path to the post markdown file",
                        default="*", nargs="?")
    args = parser.parse_args()
    if not args.post_path or args.post_path == "*":
        repo_root = os.path.abspath(os.path.join(dirname, ".."))
        # Escape the root so glob metacharacters in the checkout path are
        # matched literally.
        pattern = os.path.join(glob.escape(repo_root), "content", "articles", "**", "*.md")
        # set_image_for_post() resolves its argument relative to the repo
        # root, so hand it root-relative paths. Passing the raw glob results
        # breaks whenever `dirname` is absolute (the case on Python 3.9+).
        post_paths = [
            os.path.relpath(found, repo_root)
            for found in glob.glob(pattern, recursive=True)
        ]
        with Pool(8) as pool:
            pool.map(set_image_for_post, post_paths)
    else:
        set_image_for_post(args.post_path)
View post:
Python script to automate static Hugo site post images
|