#!/usr/bin/env python3

import os
import re
from urllib.parse import urlsplit

import html2text
import requests
from bs4 import BeautifulSoup

# Catalogue of articles to download. Each entry is a dict with four string keys:
#   slug   - base name of the output file; main() writes "<slug>.md"
#   title  - article title (not read by the code visible in this file)
#   url    - location requested by fetch(); is_raw_markdown() decides whether
#            the response is kept as Markdown or converted from HTML
#   author - credited in the acknowledgement line that fetch() prepends
FALSEHOODS = [
	{
		"slug": "names",
		"title": "Falsehoods Programmers Believe About Names",
		"url": "https://www.kalzumeus.com/2010/06/17/falsehoods-programmers-believe-about-names/",
		"author": "Patrick McKenzie",
	},
	{
		"slug": "names-with-examples",
		"title": "Falsehoods Programmers Believe About Names (With Examples)",
		"url": "https://shinesolutions.com/2018/01/08/falsehoods-programmers-believe-about-names-with-examples/",
		"author": "Shine Solutions Group",
	},
	{
		"slug": "time",
		"title": "Falsehoods Programmers Believe About Time",
		"url": "https://gist.githubusercontent.com/timvisee/fcda9bbdff88d45cc9061606b4b923ca/raw/b8bb10ae8421d0bf32e349d4d5f8f208a23a9186/falsehoods-programming-time-list.md",
		"author": "Noah Sussman (infinite undo)",
	},
	{
		"slug": "geography",
		"title": "Falsehoods Programmers Believe About Geography",
		"url": "https://wiesmann.codiferes.net/wordpress/archives/15187",
		"author": "Matthias Wiesmann",
	},
	{
		"slug": "addresses",
		"title": "Falsehoods Programmers Believe About Addresses",
		"url": "https://www.mjt.me.uk/posts/falsehoods-programmers-believe-about-addresses/",
		"author": "Michael Tandy",
	},
	{
		"slug": "online-shopping",
		"title": "Falsehoods Programmers Believe About Online Shopping",
		"url": "https://wiesmann.codiferes.net/wordpress/archives/22201",
		"author": "Matthias Wiesmann",
	},
	{
		"slug": "bitcoin",
		"title": "Falsehoods Programmers Believe About Bitcoin",
		"url": "https://raw.githubusercontent.com/theborakompanioni/bitcoin-spring-boot-starter/refs/heads/master/docs/FALSEHOODS.md",
		"author": "bitcoin-spring-boot-starter (Thebora Kompanioni)",
	},
	{
		"slug": "cs-students",
		"title": "Things Many CS Students Still Believe After Graduating",
		"url": "https://www.netmeister.org/blog/cs-falsehoods.html",
		"author": "Jan Schaumann",
	},
	{
		"slug": "email",
		"title": "Falsehoods Programmers Believe About Email",
		"url": "https://beesbuzz.biz/code/439-Falsehoods-programmers-believe-about-email",
		"author": "Fluffy Critter",
	},
	{
		"slug": "maps",
		"title": "Falsehoods Programmers Believe About Maps",
		"url": "https://web.archive.org/web/20250516080728/http://www.atlefren.net/post/2014/09/falsehoods-programmers-believe-about-maps/",
		"author": "Unknown",
	},
	{
		"slug": "languages",
		"title": "Falsehoods Programmers Believe About Languages",
		"url": "https://www.lexiconista.com/falsehoods-about-languages/",
		"author": "Michal Mechura",
	},
	{
		"slug": "plaintext",
		"title": "Falsehoods Programmers Believe About Plaintext",
		"url": "https://jeremyhussell.blogspot.com/2017/11/falsehoods-programmers-believe-about.html#main",
		"author": "JeremyHussell",
	},
	{
		"slug": "job-applicants",
		"title": "Falsehoods Programmers Believe About Job Applicants",
		"url": "https://web.archive.org/web/20170114022820/https://medium.com/@creatrixtiara/falsehoods-programmers-believe-about-job-applicants-99280437c616",
		"author": "Creatrix Tiara",
	},
	{
		"slug": "phone-numbers",
		"title": "Falsehoods Programmers Believe About Phone Numbers",
		"url": "https://raw.githubusercontent.com/google/libphonenumber/refs/heads/master/FALSEHOODS.md",
		"author": "Google (from their libphonenumber library)",
	},
	{
		"slug": "html",
		"title": "Falsehoods Programmers Believe About HTML",
		"url": "https://www.aartaka.me.eu.org/falsehoods-html",
		"author": "Artyom Bologov",
	},
]

# Directory that main() creates if missing and writes the "<slug>.md" files into.
OUTPUT_DIR = "falsehoods"
# Request headers passed to requests.get() by fetch(). The User-Agent is a
# desktop Chrome string - presumably so hosts that reject the default
# python-requests agent still respond (TODO confirm).
HEADERS = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def is_raw_markdown(url):
	"""Return True when *url* is expected to serve Markdown source, not HTML.

	A URL qualifies when its host is one of GitHub's raw-content hosts or when
	its path ends in ``.md`` (case-insensitive). Only the parsed host and path
	are inspected, so a query string or fragment after the file name does not
	hide the extension, and a GitHub host name that merely appears inside the
	path or query of another site does not count.

	Args:
		url: Absolute or relative URL string.

	Returns:
		bool: True if the response body should be treated as Markdown.
	"""
	try:
		parts = urlsplit(url)
	except ValueError:
		# Malformed authority (e.g. an unbalanced IPv6 bracket): nothing to classify.
		return False
	# ``hostname`` is lower-cased by urlsplit and is None when there is no authority.
	host = parts.hostname or ""
	return (
		host in ("raw.githubusercontent.com", "gist.githubusercontent.com")
		or parts.path.lower().endswith(".md")
	)


def extract_main_content(soup):
	"""Pick the element most likely to hold the article body.

	The lookups run from most to least specific and stop at the first hit:
	an ``<article>`` tag, a ``<main>`` tag, an element whose ``id`` matches a
	content-like pattern, an element with a well-known post-body class, and
	finally an element with a loosely content-like class. Each lookup is only
	executed when every earlier one came back empty, so a page that has an
	``<article>`` costs one tree search rather than five.

	Args:
		soup: Parsed document (any object exposing ``find`` and ``body``).

	Returns:
		The first element found; otherwise ``soup.body``, or ``soup`` itself
		when there is no body.
	"""
	# NOTE(review): the id/class patterns are unanchored, so an id such as
	# "main-nav" would also satisfy the third lookup - tighten if a site trips on it.
	lookups = (
		lambda: soup.find("article"),
		lambda: soup.find("main"),
		lambda: soup.find(id=re.compile(r"(content|post|entry|main|article)", re.I)),
		lambda: soup.find(class_=re.compile(r"(post-content|entry-content|article-content|post-body|blog-post|hentry)", re.I)),
		lambda: soup.find(class_=re.compile(r"(content|post|entry|article)", re.I)),
	)
	for lookup in lookups:
		found = lookup()
		if found is not None:
			return found
	return soup.body or soup


def html_to_markdown(html, base_url=""):
	"""Convert the main content of an HTML document to Markdown text.

	The markup is parsed with BeautifulSoup's ``html.parser``, narrowed to the
	element chosen by ``extract_main_content``, stripped of page-chrome and
	non-prose tags, and then rendered through html2text.

	Args:
		html: HTML markup; handed straight to BeautifulSoup.
		base_url: Optional. When non-empty it is assigned to the converter's
			``baseurl`` attribute.

	Returns:
		str: The rendered Markdown with surrounding whitespace stripped.
	"""
	document = BeautifulSoup(html, "html.parser")
	content = extract_main_content(document)

	# Remove navigation, page furniture and script/style blocks before rendering.
	unwanted = ["nav", "header", "footer", "aside", "script", "style"]
	for element in content.find_all(unwanted):
		element.decompose()

	renderer = html2text.HTML2Text()
	# Converter settings, applied as plain attributes on the HTML2Text instance.
	options = {
		"ignore_links": False,
		"ignore_images": True,
		"body_width": 0,
		"unicode_snob": True,
		"wrap_links": False,
	}
	for option, value in options.items():
		setattr(renderer, option, value)
	if base_url:
		renderer.baseurl = base_url

	rendered = renderer.handle(str(content))
	return rendered.strip()


def fetch(entry):
	"""Download one catalogue entry and return it as a Markdown document.

	The URL is requested with the module's ``HEADERS`` and a 30-second timeout.
	Responses from URLs that ``is_raw_markdown`` accepts are used as-is (with
	surrounding whitespace stripped); everything else is converted from HTML via
	``html_to_markdown``. An acknowledgement line and a horizontal rule are
	placed in front of the body.

	Args:
		entry: Mapping with at least the ``"url"`` and ``"author"`` keys.

	Returns:
		str: ``Contributed by <author>, [source](<url>)``, a ``---`` rule, then
		the article body.

	Raises:
		requests.RequestException: On connection failure or timeout, and (via
			``raise_for_status``) on an HTTP error status.
	"""
	url = entry["url"]
	response = requests.get(url, headers=HEADERS, timeout=30)
	response.raise_for_status()
	# When a text/* response declares no charset, requests assumes ISO-8859-1,
	# which garbles UTF-8 pages. Use the encoding detected from the payload
	# instead; a charset stated by the server is still honoured.
	content_type = response.headers.get("Content-Type", "")
	if "charset" not in content_type.lower():
		response.encoding = response.apparent_encoding
	if is_raw_markdown(url):
		body = response.text.strip()
	else:
		body = html_to_markdown(response.text, base_url=url)
	acknowledgement = f"Contributed by {entry['author']}, [source]({url})\n\n---\n\n"
	return acknowledgement + body


def _save_entry(entry):
	"""Fetch *entry* and write it to ``OUTPUT_DIR/<slug>.md``; return that path."""
	markdown = fetch(entry)
	destination = os.path.join(OUTPUT_DIR, f"{entry['slug']}.md")
	with open(destination, "w", encoding="utf-8") as handle:
		handle.write(markdown)
	return destination


def main():
	"""Download every entry in ``FALSEHOODS`` into ``OUTPUT_DIR``.

	The output directory is created if it does not exist. One progress line is
	printed per entry. Any exception raised while fetching or writing an entry
	is reported as ``FAILED: ...`` and the loop continues with the next entry.
	"""
	os.makedirs(OUTPUT_DIR, exist_ok=True)
	for entry in FALSEHOODS:
		print(f"Fetching {entry['slug']}...", end=" ", flush=True)
		try:
			written = _save_entry(entry)
			print(f"saved to {written}")
		except Exception as error:
			# Best-effort batch: one bad source must not stop the remaining downloads.
			print(f"FAILED: {error}")


if __name__ == "__main__":
	main()
