1 files changed, 71 insertions, 0 deletions
diff --git a/ircbot/urltitle.py b/ircbot/urltitle.py
new file mode 100644
index 0000000..2d06fdc
--- /dev/null
+++ b/ircbot/urltitle.py
@@ -0,0 +1,71 @@
+import re
+import requests
+import html
+
+# Finds http(s) URLs in free text; a match cannot end in punctuation such as '.', ',' or ';'.
+url_pattern = re.compile(
+    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])', flags=re.IGNORECASE
+)
+
+# Captures the text of the first <title> element; the text may span lines and contain '>'.
+title_regex = re.compile(
+    r'<title\b[^>]*>(.*?)</title>', flags=re.IGNORECASE | re.DOTALL
+)
+
+# Request headers that url_titles passes to requests.get for every fetch.
+headers = {
+    "User-Agent": "spider/2.1",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    # No "br": requests can only decode brotli when an optional brotli package is installed.
+    "Accept-Encoding": "gzip, deflate",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+    "Connection": "keep-alive", "DNT": "1", "Upgrade-Insecure-Requests": "1",
+}
+
+def url_titles(text):
+    """Return the HTML <title> of the first http(s) URL found in *text*.
+
+    The result is a list holding at most one title. Network errors, non-200
+    responses, non-HTML bodies and missing/empty titles are printed and give
+    an empty list. Whitespace runs in the title are collapsed to one space,
+    so a title can never carry CR/LF into a line-based (e.g. IRC) message.
+    """
+    # NOTE(review): untrusted URLs are fetched as-is (SSRF risk) -- consider filtering.
+    results = []
+    MAX_CONTENT_SIZE = 5 * 1024 * 1024
+    for url in url_pattern.findall(text)[:1]:  # only the first URL is fetched
+        try:
+            with requests.get(url, timeout=10, allow_redirects=True, stream=True, headers=headers) as response:
+                if response.status_code != requests.codes.ok:
+                    print("html resp error:" + str(response.status_code))
+                    continue
+                content_type = response.headers.get('Content-Type', '').lower()
+                if 'text/html' not in content_type:
+                    print("not html")
+                    continue
+                content = bytearray()
+                for chunk in response.iter_content(chunk_size=8192):
+                    content += chunk
+                    # Stop at the size cap, or once the closing tag has arrived.
+                    if len(content) >= MAX_CONTENT_SIZE or b'</title>' in chunk.lower():
+                        break
+                # Use the charset the server declared; fall back to UTF-8.
+                charset = re.search(r'charset=["\']?([\w.:-]+)', content_type)
+                try:
+                    html_doc = content.decode(charset.group(1) if charset else 'utf-8', errors='ignore')
+                except LookupError:  # codec name unknown to Python
+                    html_doc = content.decode('utf-8', errors='ignore')
+                match = title_regex.search(html_doc)
+                # Unescape before collapsing: entities like &#10; are whitespace too.
+                title = ' '.join(html.unescape(match.group(1)).split()) if match else ''
+                if title:
+                    results.append(title)
+                else:
+                    print('title empty' if match else 'title not found in html')
+        except Exception as e:
+            print(e)
+    return results
+
+if __name__ == '__main__':  # manual smoke test; performs real network requests
+    for sample in ('<> https://en.wikipedia.org/wiki/A-normal_form', '<> https://www.bilibili.com/video/BV1c31iBDEXY/'):
+        print(url_titles(sample))
+\ No newline at end of file