# previewbot/urlpreview.py

import asyncio
import codecs
import html.parser
import urllib.parse

import aiohttp

class TitleExtractor(html.parser.HTMLParser):
    """Incremental HTML parser that records the text of ``<title>`` elements.

    Markup may be passed to ``feed()`` in arbitrary pieces. Once a
    ``</title>`` end tag that closes an open ``<title>`` has been parsed,
    ``latest_title`` holds the text found between the two tags. Until then
    ``latest_title`` is ``None``.
    """

    def __init__(self):
        super().__init__()
        # Most recent text node seen anywhere in the document. Not used for
        # title extraction any more; kept so existing readers of the
        # attribute keep working.
        self.current_data = None
        # Text of the most recently closed <title> element, or None.
        self.latest_title = None
        # True while the parser is between <title> and </title>.
        self._in_title = False
        # Text fragments collected for the currently open <title>.
        self._title_parts = []

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a ``<title>`` element opens."""
        if tag == "title":
            self._in_title = True
            self._title_parts = []

    def handle_data(self, data):
        """Remember the text node and, inside ``<title>``, accumulate it."""
        self.current_data = data
        if self._in_title:
            # A single run of text can arrive as several handle_data calls
            # when the markup is fed in chunks, so fragments are collected
            # and joined rather than overwritten.
            self._title_parts.append(data)

    def handle_endtag(self, tag):
        """Publish the collected text when the open ``<title>`` closes."""
        # A stray </title> with no matching start tag is ignored so that
        # unrelated text is never reported as the title.
        if tag == "title" and self._in_title:
            self.latest_title = "".join(self._title_parts)
            self._in_title = False
            self._title_parts = []

async def get_preview(url):
    """Fetch *url* and return the text of its HTML ``<title>``, if any.

    The URL fragment is dropped before the request is made, and
    ``youtube.com/shorts/`` links are rewritten to the
    ``youtube.com/watch?v=`` form. The body is streamed in 4096-byte
    chunks and reading stops as soon as a non-empty title has been parsed.

    Args:
        url: The URL to preview.

    Returns:
        The first non-empty title found, or ``None`` when the response is
        not ``text/html`` or no title was found.

    Errors raised by aiohttp while connecting or reading are not caught
    here and propagate to the caller.
    """
    url = url.replace("youtube.com/shorts/", "youtube.com/watch?v=")
    url_parsed = urllib.parse.urlparse(url)
    url_no_fragment = url_parsed._replace(fragment="").geturl()
    headers = {"user-agent": "Mozilla/5.0 XmppPreviewer/1.0"}
    title = None
    # Context managers guarantee the response and session are released even
    # when the request, decoding, or parsing raises.
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url_no_fragment) as resp:
            print(url_no_fragment, resp.status)
            # A response without a Content-Type header is treated as non-HTML
            # instead of raising KeyError.
            content_type = resp.headers.get("content-type", "")
            if content_type.startswith("text/html"):
                parser = TitleExtractor()
                # Assume utf-8. A multi-byte sequence can straddle a read
                # boundary, so decode incrementally; undecodable bytes are
                # replaced rather than aborting the whole preview.
                decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
                while chunk := await resp.content.read(4096):
                    parser.feed(decoder.decode(chunk))
                    if parser.latest_title:
                        title = parser.latest_title
                        break
    return title

if __name__ == "__main__":
    # Manual smoke test: look up one fixed link and print whatever title
    # get_preview returns for it.
    async def main():
        title = await get_preview("https://youtu.be/WNFahAioGP8")
        print(title)

    asyncio.run(main())