# luciaa.at/cgbe.py

import re
import jinja2
import json
import pathlib
from datetime import datetime
import mistune
import argparse

# Globals
# Built-in default settings. apply_config() overlays the values found in configs/cgbe.json onto this dictionary and
# writes these defaults to that file when it does not exist yet.
global_config = {
    "paths": {
        "entries_folder": "entry_data/",  # scanned for <name>.md / <name>.json pairs
        "templates_folder": "templates/",  # root directory handed to the Jinja2 file system loader
        "templates": {
            "blog_entry": "blog_entry.html",
            "overview": "overview.html",
            "tag_overview": "tag_overview.html",
        },
        "generated_folder": "generated/",  # prefix of the generated entry and tag pages
        "generated": {
            "overview": "overview.html",  # output path of the overview page, also passed on as back link
        },
    },
    # Options read by format_datetime(). The key "date_seperator" is spelled this way on purpose: it is the name
    # used in the code and in written config files.
    "date_time": {
        "use_unix_time": False,
        "exclusively_use_unix_time": False,
        "date_ordering": "YMD",  # "YMD", "DMY" or "MDY" (compared case-insensitively)
        "date_seperator": "-",  # a single string or a list of separators
        "show_time": False,
        "time_format": "24h",  # NOTE(review): not read by any code visible in this file
        "show_seconds": False,
        "displayed_timezone": None,  # appended to the formatted date when set
    },
    "defaults": {
        "author": None,
        "date_time": None,  # fallback consulted by date_time_handling() for unparsable dates
    },
}
# Module-wide switch for additional diagnostic output.
verbose = False
# Extra values handed to every template as "opt"; filled in by the script entry point.
opt = {}


class Entry:
    """Plain data holder for one blog entry: its metadata plus heading and rendered HTML.

    Every constructor argument is optional, defaults to None and is stored unchanged on the attribute of the same
    name.
    """

    def __init__(self, href=None, date=None, time=None, author=None, tags=None, content_warnings=None, heading=None,
                 html=None):
        # metadata
        self.href, self.date, self.time = href, date, time
        self.author, self.tags = author, tags
        self.content_warnings = content_warnings
        # content
        self.heading, self.html = heading, html

    def __str__(self):
        """Return a multi-line, human-readable dump of the entry (the time attribute is not included)."""
        metadata_line = (f"Hyperlink: {self.href}, Date: {self.date}, Author: {self.author}, Tags: {self.tags}, "
                         f"Content Warnings: {self.content_warnings}.")
        return "\n".join((metadata_line, f"Heading: {self.heading}", "Unformatted text:", f"{self.html}"))


# Directory the templates are loaded from.
# NOTE(review): this is evaluated at import time, i.e. before the script entry point calls apply_config(), so a
# "templates_folder" value from configs/cgbe.json does not reach the loader - confirm whether that is intended.
template_path = pathlib.Path(global_config["paths"]["templates_folder"])

# Shared Jinja2 environment used by render_template(). Autoescaping is switched off, so template variables are
# inserted into the output verbatim.
jenv = jinja2.Environment(
    loader=jinja2.FileSystemLoader(template_path),
    autoescape=False,
    trim_blocks=True,
    lstrip_blocks=True,
    keep_trailing_newline=False,
)


def render_template(template_name, output_path, **kwargs):
    """Render a template from the shared Jinja2 environment and write the result to a file.

    :param template_name: name of the template, resolved by the loader of ``jenv``
    :param output_path: path of the file to (over)write; missing parent directories are created
    :param kwargs: variables made available to the template
    """
    template = jenv.get_template(template_name)
    # Render before touching the file system so that a failing template does not leave a truncated output file.
    rendered = template.render(**kwargs)

    destination = pathlib.Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8, matching how the entry files are read; the platform default encoding may not be able to
    # represent every character of the rendered page.
    with open(destination, "w", encoding="utf-8") as out_file:
        out_file.write(rendered)


def _merge_config(target, overrides):
    """Recursively copy the values of ``overrides`` into ``target``; only dictionaries are merged key by key."""
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            _merge_config(target[key], value)
        else:
            target[key] = value


def apply_config():
    """Overlay the settings from ``configs/cgbe.json`` onto ``global_config``.

    Nested dictionaries are merged key by key, so a config file only has to list the values it wants to change.
    All other values (strings, lists, numbers, ``null``) replace the built-in default, and sections unknown to the
    defaults are added as they are.

    If the config file does not exist, the current ``global_config`` is written to that location instead (the
    ``configs`` directory is created when necessary) and ``global_config`` itself is left untouched.

    :raises json.JSONDecodeError: if the config file exists but does not contain valid JSON
    """
    config_path = pathlib.Path("configs/cgbe.json")
    try:
        with open(config_path, "r", encoding="utf-8") as raw_json:
            config_data = json.load(raw_json)
    except FileNotFoundError:
        print("ERROR: Config file doesn't exist in expected location.")
        print("Writing new config file.")
        config_path.parent.mkdir(parents=True, exist_ok=True)
        with open(config_path, "w", encoding="utf-8") as write_file:
            write_file.write(json.dumps(global_config, indent=4))
        print("SUCCESS: config file written")
        return

    _merge_config(global_config, config_data)
    if verbose:
        print("The following configuration was found and has been applied:")
        print(global_config)


def date_time_handling(str_date_time):
    """Parse an ISO 8601 string into a ``datetime``, falling back to the configured default.

    If ``str_date_time`` cannot be parsed (or is not a string), an error is printed and
    ``global_config["defaults"]["date_time"]`` decides the result:

    * ``"now"`` (any casing): the current local time
    * ``"0"`` or ``"unix_0"`` (any casing): the Unix epoch, ``datetime.fromtimestamp(0)``
    * unset / empty: ``None``
    * anything else: a message about the invalid value is printed and ``None`` is returned

    :param str_date_time: date/time string as accepted by ``datetime.fromisoformat``
    :return: the parsed or default ``datetime``, or ``None`` if neither is available
    """
    try:
        return datetime.fromisoformat(str_date_time)
    except ValueError:
        print(
            f"ERROR: Provided datetime string invalid. Expected ISO8601 formatted datetime. Received string: "
            f"{str_date_time}")
    except TypeError:
        print(f"ERROR: Provided datetime isn't a string. Received type: {type(str_date_time)}")
    if verbose:
        print("Defaulting to provided default time")

    default = global_config["defaults"]["date_time"]
    if not default:
        if verbose:
            print("No default time was set. Date will be left empty")
        return None

    # str() guards against non-string values (e.g. a number) coming from the JSON config.
    keyword = str(default).lower()
    if keyword == "now":
        date_time = datetime.now()
    elif keyword in ("0", "unix_0"):
        date_time = datetime.fromtimestamp(0)
    else:
        print(f"Invalid value {str(default)} was provided.")
        return None

    if verbose:
        print(f"Default time was set to {keyword}. New ISO Datetime {date_time}")
    return date_time


def format_datetime(date_time_to_format):
    """Format a ``datetime`` for display according to ``global_config["date_time"]``.

    The result consists of the date (in the configured ordering and with the configured separators), followed by
    the time if ``show_time`` is set (with seconds if ``show_seconds`` is set as well) and by
    ``displayed_timezone`` if one is configured. The parts are joined by single spaces.

    ``date_seperator`` may be a single string, which is placed between the date components, or a list of up to
    three strings placed after the first, second and third component. A one-element list behaves like a single
    string, shorter lists are padded with "no separator" and elements beyond the third are ignored.

    An unknown or non-string ``date_ordering`` is reported and replaced by ``"YMD"``.

    :param date_time_to_format: the ``datetime`` to format, or ``None``
    :return: the formatted string; the Unix timestamp as string if both ``use_unix_time`` and
        ``exclusively_use_unix_time`` are set; ``None`` if ``date_time_to_format`` is ``None``
    """
    if date_time_to_format is None:
        return None

    settings = global_config["date_time"]

    # NOTE(review): "use_unix_time" without "exclusively_use_unix_time" has no effect on the output - confirm
    # what the non-exclusive mode is supposed to display.
    if settings["use_unix_time"] and settings["exclusively_use_unix_time"]:
        # int() drops the fractional seconds of the timestamp
        return str(int(date_time_to_format.timestamp()))

    # Normalise the configured separator(s) to exactly three strings, one following each date component.
    configured_separator = settings["date_seperator"]
    if isinstance(configured_separator, list):
        if len(configured_separator) == 1:
            separators = [configured_separator[0], configured_separator[0], None]
        else:
            separators = (configured_separator + [None, None, None])[:3]
    elif configured_separator:
        separators = [configured_separator, configured_separator, None]
    else:
        separators = [None, None, None]
    separators = [str(separator or "") for separator in separators]

    ordering = settings["date_ordering"]
    if isinstance(ordering, str) and ordering.lower() in ("ymd", "dmy", "mdy"):
        ordering = ordering.lower()
    else:
        if isinstance(ordering, str):
            print(f'ERROR: Date format string of either "YMD", "DMY" or "MDY" was expected. Received {ordering}')
        else:
            print(f"ERROR: Date format wasn't provided as str. Received type: {type(ordering)}")
        print("Falling back to YMD formating.")
        ordering = "ymd"

    date_parts = {
        "y": f"{date_time_to_format.year:04d}",
        "m": f"{date_time_to_format.month:02d}",
        "d": f"{date_time_to_format.day:02d}",
    }
    formatted_date_time = "".join(
        date_parts[component] + separator for component, separator in zip(ordering, separators))

    if settings["show_time"]:
        time_str = f"{date_time_to_format.hour:02d}:{date_time_to_format.minute:02d}"
        if settings["show_seconds"]:
            time_str += f":{date_time_to_format.second:02d}"
        formatted_date_time = f"{formatted_date_time} {time_str}"

    # TODO: Add check if time was also provided (Regex maybe?)

    if settings["displayed_timezone"]:
        formatted_date_time = f"{formatted_date_time} {settings['displayed_timezone']}"
    return formatted_date_time


def format_text(text_to_format):
    """Return the HTML produced by ``mistune.html`` for the given text."""
    # TODO: implement this with more options. For now, it's unused, instead will be processed in collect_entry_data.
    rendered_html = mistune.html(text_to_format)
    return rendered_html


def collect_all_blog_combinations():
    """Find the entries for which both a ``.md`` and a ``.json`` file exist in the entries folder.

    Files are matched by their name without extension. Names that exist with only one of the two extensions are
    reported with a NOTICE and left out.

    :return: set of file names (without extension) that have both files
    """
    entries_dir = pathlib.Path(global_config["paths"]["entries_folder"])
    markdown_stems = {path.stem for path in entries_dir.glob("*.md")}
    metadata_stems = {path.stem for path in entries_dir.glob("*.json")}

    orphaned_markdown = markdown_stems.difference(metadata_stems)
    if orphaned_markdown:
        print("NOTICE: For the following .md files there's .json missing:", orphaned_markdown)

    orphaned_metadata = metadata_stems.difference(markdown_stems)
    if orphaned_metadata:
        print("NOTICE: For the following .json files there's .md missing:", orphaned_metadata)

    # Everything that is left after removing the orphans is exactly the intersection of both sets.
    file_pairs = markdown_stems & metadata_stems

    if verbose:
        print(f"The following file combinations were found and will be used for generation:{file_pairs}")

    return file_pairs


def collect_entry_data(pair_name):
    """Build the ``Entry`` for one ``<pair_name>.json`` / ``<pair_name>.md`` pair in the entries folder.

    The JSON file supplies the metadata. ``href`` is required. ``date``, ``author``, ``tags`` and
    ``content_warnings`` are optional: a missing ``author`` falls back to ``global_config["defaults"]["author"]``,
    missing or empty ``tags`` become an empty list, and an entry without a usable date gets ``date=None``.

    If the first line of the Markdown file is a level-one heading (``# ...``), its text is stored as the entry's
    heading and removed from the text before it is converted to HTML.

    :param pair_name: common file name (without extension) of the two files
    :return: the populated ``Entry``
    :raises KeyError: if the metadata does not contain ``href``
    """
    entries_folder = pathlib.Path(global_config["paths"]["entries_folder"])

    # JSON config file meta data loading and formating
    with open(entries_folder / f"{pair_name}.json", "r", encoding="utf-8") as raw_json:
        metadata = json.load(raw_json)

    formated_datetime = None
    if metadata.get("date"):
        given_datetime = date_time_handling(metadata["date"])
        # date_time_handling() yields None when the date is invalid and no default applies.
        if given_datetime is not None:
            formated_datetime = format_datetime(given_datetime)

    entry_data = Entry(href=metadata["href"], date=formated_datetime,
                       author=metadata.get("author", global_config["defaults"]["author"]),
                       tags=metadata.get("tags") or [],
                       content_warnings=metadata.get("content_warnings"))

    # extracting the entire raw text given
    with open(entries_folder / f"{pair_name}.md", "r", encoding="utf-8") as raw_text:
        text_with_heading = raw_text.read()

    # Neither " ", "#" nor "." matches a line break, so a match anchored at the start of the text can only ever
    # cover (part of) the first line.
    heading_match = re.match(r" {,3}# +(.+)", text_with_heading)
    if heading_match:
        entry_data.heading = heading_match.group(1)
        text_with_heading = text_with_heading[heading_match.end():]
    entry_data.html = mistune.html(text_with_heading)
    return entry_data


def collect_tags(metadata):
    """Group blog entries by tag.

    :param metadata: iterable of entries, each providing an iterable ``tags`` attribute
    :return: dict mapping every tag to the list of entries carrying it, ordered from the most-used to the
        least-used tag (tags with equal counts keep the order in which they were first seen)
    """
    entries_by_tag = {}
    for entry in metadata:
        for tag in entry.tags:
            entries_by_tag.setdefault(tag, []).append(entry)
    ranked_tags = sorted(entries_by_tag.items(), key=lambda pair: len(pair[1]), reverse=True)
    return dict(ranked_tags)


def generate_blog_overview(overview_data, tag_data=None):
    """Render the overview page listing the blog entries.

    :param overview_data: entries passed to the overview template as ``blog_data``
    :param tag_data: mapping of tag to entries, passed to the template as ``tag_occurences``; may be omitted
        when tags are not used, in which case the template receives an empty mapping
    """
    paths = global_config["paths"]
    render_template(paths["templates"]["overview"], paths["generated"]["overview"],
                    blog_data=overview_data, tag_occurences={} if tag_data is None else tag_data, opt=opt)


def generate_tag_overviews(tag_data):
    """Render one overview page per tag to ``<generated_folder>tags/<tag>.html``.

    The template is taken from ``global_config["paths"]["templates"]["tag_overview"]``. Output directories are
    created if they do not exist yet.

    :param tag_data: mapping of tag to the list of entries carrying it; nothing happens if it is empty or None
    """
    if not tag_data:
        return

    paths = global_config["paths"]
    for tag, occurences in tag_data.items():
        output_path = f"{paths['generated_folder']}tags/{tag}.html"
        pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        render_template(paths["templates"]["tag_overview"], output_path, tag=tag, occurences=occurences,
                        overview_backlink=paths["generated"]["overview"], opt=opt)


def generate_blog_entries(blog_entry_data):
    """Render the page of every blog entry to ``<generated_folder><entry.href>``.

    The template is taken from ``global_config["paths"]["templates"]["blog_entry"]``. Output directories are
    created if they do not exist yet.

    :param blog_entry_data: iterable of entries; each is passed to the template as ``entry``
    """
    paths = global_config["paths"]
    for entry in blog_entry_data:
        output_path = f"{paths['generated_folder']}{entry.href}"
        pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        render_template(paths["templates"]["blog_entry"], output_path, entry=entry,
                        overview_backlink=paths["generated"]["overview"], opt=opt)


if __name__ == "__main__":
    # Version and release date are read from fixed positions (5th and 6th whitespace-separated token) of
    # version_history.md; the first and last character of the date token are stripped.
    version = ""
    version_date = ""
    version_notice = None
    try:
        with open("version_history.md", encoding="utf-8") as version_history:
            split_version_history = version_history.read().split()
        version = split_version_history[4]
        version_date = split_version_history[5][1:-1]
    except FileNotFoundError:
        version_notice = ("NOTICE: version_history.md was not found. Perhaps it has been removed or renamed. CGBE "
                          "will be unable to display version information")
    except IndexError:
        version_notice = ("NOTICE: version_history.md contains fewer words than expected. CGBE may be unable to "
                          "display version information")
    # License information and argument parsing
    print(f"""CatGirlBlogEngine (CGBE) {version} - {version_date}

Copyright (C) 2025  Lucia Zehentner

This program comes with ABSOLUTELY NO WARRANTY;
for details provide argument "-w".
This is free software, and you are welcome to redistribute it under certain
conditions; provide argument "-r" for details.
The full license can be displayed by providing the "-l" argument.
For contact data provide argument "-c".
    """)
    parser = argparse.ArgumentParser()

    # Adding optional argument
    parser.add_argument("-w", "--warranty", help="Display warranty information", action='store_true')
    parser.add_argument("-r", "--redistribution", help="Display conditions of redistribution",
                        action='store_true')
    parser.add_argument("-l", "--license", help="Display full license", action='store_true')
    parser.add_argument("-c", "--contact", help="Display contact information", action='store_true')
    parser.add_argument("-v", "--verbose", help="Meow a lot about literally everything!",
                        action='store_true')

    args = parser.parse_args()
    # Propagate the flag to the module-level switch that all functions of this file consult.
    verbose = args.verbose

    license_text = None
    try:
        with open("LICENSE", "r", encoding="utf-8") as license_file:
            license_text = license_file.read().split("\n")
    except FileNotFoundError:
        print("WARNING: LICENSE file missing.")

    if args.warranty or args.redistribution or args.license:
        if license_text is None:
            # The requested text cannot be shown; do not silently carry on with generating the blog instead.
            raise SystemExit(1)
        # NOTE(review): the line ranges below are fixed offsets into the LICENSE file - presumably the warranty
        # and the redistribution sections; verify them against the shipped LICENSE.
        if args.warranty:
            print("\n".join(license_text[588:619]))
        elif args.redistribution:
            print("\n".join(license_text[153:405]))
        else:
            print("\n".join(license_text))
        raise SystemExit(0)
    if args.contact:
        print("""CONTACT ME via
    eMail:   mail@luciaa.at
    XMPP:    schlecknits@xmpp.yepoleb.at
    Matrix:  @schlecknits:chat.ohaa.xyz
    Fedi:    @schlecknits@tyrol.social

    Further contact data available at luciaa.at""")
        raise SystemExit(0)
    if verbose:
        print("NOTICE: Verbose mode activated")
        if version_notice:
            print(version_notice)

    apply_config()
    blog_combinations = collect_all_blog_combinations()
    blog_data = []
    blog_data_without_date = []
    for combination in blog_combinations:
        current_data = collect_entry_data(combination)
        if current_data.date:
            blog_data.append(current_data)
        else:
            blog_data_without_date.append(current_data)
    if verbose and blog_data_without_date:
        print(f"NOTICE: The following entries do not contain a date and therefore will not be sorted: "
              f"{[entry.href for entry in blog_data_without_date]}")
    # NOTE(review): the sort key is the already formatted date string, so the order is only chronological for
    # formats that sort lexicographically (e.g. the default "YMD" ordering).
    blog_data = sorted(blog_data, key=lambda x: x.date, reverse=True)
    # Entries without a date go to the end, in the order in which they were collected.
    blog_data.extend(blog_data_without_date)

    # TODO: remove the temporary opt assignment and find a more permanent solution
    opt["date"] = format_datetime(datetime.now())
    opt["current_site"] = "blog"

    # TODO: replace this constant with a configurable variable which determines if tags are used
    use_tags = True
    if use_tags:
        # TODO: find out similarity between tags, if two are very similar give out a typo warning
        #  print(f"WARNING: Tags {a} and {b} are very similar. This may be a typo.")
        tag_occurences = collect_tags(blog_data)
        if tag_occurences:
            generate_tag_overviews(tag_occurences)
        generate_blog_overview(blog_data, tag_occurences)
    else:
        # No tag information available: hand the overview an empty mapping.
        generate_blog_overview(blog_data, {})
    generate_blog_entries(blog_data)