summary refs log tree commit diff
path: root/ircbot
diff options
context:
space:
mode:
author Mistivia <i@mistivia.com> 2025-11-16 15:13:49 +0800
committer Mistivia <i@mistivia.com> 2025-11-16 15:13:49 +0800
commit 07f98d9aecb6998cdc1dd649c12d6e49bed67f67 (patch)
tree 05ab7f89ff9cdec98a3d8467504de5009edcaaa8 /ircbot
parent 1bb8abca1549dba9c0c5660e9b3efa81ccb9d781 (diff)
update
Diffstat (limited to 'ircbot')
-rw-r--r-- ircbot/test.sh 2
-rw-r--r-- ircbot/urltitle.py 71
2 files changed, 73 insertions, 0 deletions
diff --git a/ircbot/test.sh b/ircbot/test.sh
new file mode 100644
index 0000000..e77da55
--- /dev/null
+++ b/ircbot/test.sh
@@ -0,0 +1,2 @@
+# Smoke-test launcher: install the test configuration, then start the bot
+# through proxychains (-q keeps proxychains itself quiet).
+# Stop at the first failing command so the bot is never started with a
+# stale or missing config.json when the copy fails.
+set -e
+cp config.json.test config.json
+proxychains -q python main.py
diff --git a/ircbot/urltitle.py b/ircbot/urltitle.py
new file mode 100644
index 0000000..2d06fdc
--- /dev/null
+++ b/ircbot/urltitle.py
@@ -0,0 +1,71 @@
+import re
+import requests
+import html
+
+# Finds http:// and https:// URLs inside free-form chat text, ignoring case.
+# The whole URL is the pattern's only capture group, so findall() yields a
+# flat list of URL strings.  The final character class leaves out "!:,.;",
+# which keeps trailing sentence punctuation from being captured.
+url_pattern = re.compile(
+    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
+    flags=re.I,
+)
+
+# Captures the text between the first <title ...> tag and the next </title>.
+# The capture is a non-greedy ".*?" (with DOTALL so it spans line breaks)
+# rather than "[^>]*?": a literal ">" is legal inside title text
+# (e.g. "Home > Docs"), and excluding it made such pages look untitled.
+title_regex = re.compile(
+    r'<title[^>]*>(.*?)</title>',
+    re.IGNORECASE | re.DOTALL
+)
+
+# Request headers sent with every title lookup.
+# Accept-Encoding advertises only gzip and deflate: Requests always decodes
+# those two, while a brotli ("br") body is decoded only when an optional
+# Brotli package is installed.  Advertising "br" without it lets servers
+# reply with bytes the title regex cannot read.
+headers = {
+    "User-Agent": "spider/2.1",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "Accept-Encoding": "gzip, deflate",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+    "Connection": "keep-alive",
+    "DNT": "1",
+    "Upgrade-Insecure-Requests": "1",
+}
+
+def url_titles(text):
+    """Return the HTML page title of the first URL found in *text*.
+
+    Only the first URL matched by ``url_pattern`` is fetched.  The page is
+    streamed and reading stops once about 5 MiB have been received.  The
+    title is HTML-unescaped and every run of whitespace (including CR/LF)
+    is collapsed to a single space, so each result is a single line.
+
+    Returns a list holding at most one title string.  The list is empty
+    when *text* has no URL, the response is not a 200 ``text/html`` page,
+    no non-empty ``<title>`` is found, or the request fails.  Failures are
+    printed and never raised.
+    """
+    MAX_CONTENT_SIZE = 5 * 1024 * 1024
+    results = []
+    # Only the first URL of a message is looked up.
+    for url in url_pattern.findall(text)[:1]:
+        try:
+            with requests.get(
+                url,
+                timeout=10,
+                allow_redirects=True,
+                stream=True,
+                headers=headers
+            ) as response:
+                if response.status_code != requests.codes.ok:
+                    print("html resp error:" + str(response.status_code))
+                    continue
+                content_type = response.headers.get('Content-Type', '').lower()
+                if 'text/html' not in content_type:
+                    print("not html")
+                    continue
+                # bytearray grows in place; repeated bytes += copies the
+                # whole buffer on every chunk.
+                content = bytearray()
+                for chunk in response.iter_content(chunk_size=8192):
+                    content += chunk
+                    if len(content) >= MAX_CONTENT_SIZE:
+                        break
+            html_doc = _decode_html(
+                bytes(content[:MAX_CONTENT_SIZE]), content_type
+            )
+            match = title_regex.search(html_doc)
+            if not match:
+                print('title not found in html')
+                continue
+            # Unescape first: entities such as &#10; or &nbsp; become
+            # whitespace and must be collapsed together with literal
+            # line breaks.
+            title_content = ' '.join(html.unescape(match.group(1)).split())
+            if title_content:
+                results.append(title_content)
+            else:
+                print('title empty')
+        except Exception as e:
+            # Deliberate best effort: a failed lookup must not propagate
+            # to the caller.
+            print(e)
+    return results
+
+def _decode_html(content, content_type):
+    """Decode *content* with the charset the page declares, else UTF-8.
+
+    The charset is taken from the Content-Type header value when present,
+    otherwise from a ``<meta ... charset=...>`` declaration within the
+    first 4096 bytes.  Unknown codec names fall back to UTF-8, and
+    undecodable bytes are dropped.
+    """
+    declared = re.search(r'charset\s*=\s*["\']?([-\w]+)', content_type,
+                         re.IGNORECASE)
+    if declared:
+        encoding = declared.group(1)
+    else:
+        meta = re.search(rb'<meta[^>]+charset\s*=\s*["\']?([-\w]+)',
+                         content[:4096], re.IGNORECASE)
+        # \w in a bytes pattern is ASCII-only, so the name decodes cleanly.
+        encoding = meta.group(1).decode('ascii') if meta else 'utf-8'
+    try:
+        return content.decode(encoding, errors='ignore')
+    except LookupError:
+        return content.decode('utf-8', errors='ignore')
+
+if __name__ == '__main__':
+    # Manual smoke test: look up and print the titles for two sample
+    # chat messages (needs network access).
+    for sample_message in (
+        '<> https://en.wikipedia.org/wiki/A-normal_form',
+        '<> https://www.bilibili.com/video/BV1c31iBDEXY/',
+    ):
+        print(url_titles(sample_message))
\ No newline at end of file