From cb63d6de3d3e8b747bf1b178190a183b9ab4a506 Mon Sep 17 00:00:00 2001
From: Mistivia
Date: Tue, 4 Nov 2025 21:30:24 +0800
Subject: url title bot

---
Note: re-flowed from a copy whose line breaks and indentation were flattened.
The indentation and the blank context lines shown in the hunks are a best
guess; check them against ircbot/main.py before applying.
Relative to commit cb63d6d, url_titles() now stops reading at the size cap and
collapses whitespace in the title, and title_regex (partly lost in the
flattened copy) is rebuilt with a capture that stops at '<'.

 ircbot/main.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/ircbot/main.py b/ircbot/main.py
index a7c8c3a..3d5b4e8 100644
--- a/ircbot/main.py
+++ b/ircbot/main.py
@@ -7,6 +7,8 @@ import os
 import datetime
 import urllib.parse
 import hashlib
+import re
+import requests
 
 config = None
 with open('./config.json', 'r', encoding='utf-8') as f:
@@ -227,6 +229,53 @@ def roll_command(chan, sender, args):
 
 # ========================================================================
 
+url_pattern = re.compile(
+    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
+    re.IGNORECASE
+)
+
+title_regex = re.compile(
+    r'<title[^>]*>([^<]*?)</title>',
+    re.IGNORECASE | re.DOTALL
+)
+
+def url_titles(text):
+    """Return the HTML <title> of the first http(s) URL found in text.
+
+    The result is a list of at most one string; it is empty on any failure.
+    """
+    MAX_CONTENT_SIZE = 5 * 1024 * 1024
+    results = []
+    for url in url_pattern.findall(text)[:1]:
+        try:
+            with requests.get(
+                url,
+                timeout=10,
+                allow_redirects=True,
+                stream=True
+            ) as response:
+                if response.status_code != requests.codes.ok:
+                    continue
+                content_type = response.headers.get('Content-Type', '').lower()
+                if 'text/html' not in content_type:
+                    continue
+                content = bytearray()
+                for chunk in response.iter_content(chunk_size=8192):
+                    content += chunk
+                    if len(content) >= MAX_CONTENT_SIZE:
+                        # Enforce the cap: 'continue' here kept downloading.
+                        break
+                html_doc = content.decode('utf-8')
+                match = title_regex.search(html_doc)
+                if match:
+                    # Collapse whitespace so no CR/LF reaches the IRC line.
+                    title_content = ' '.join(match.group(1).split())
+                    if title_content:
+                        results.append(title_content)
+        except Exception as e:  # best effort: a bad URL must not stop the bot
+            print(e)
+    return results
+
 def cut_string(text, chunk_size=420):
     chunks = []
     current_chunk = []
@@ -430,13 +479,13 @@ class IRCBot:
 
             print(f"[{target}] <{sender_nick}>: {message}")
             write_log(target, sender_nick, message)
+            reply_target = target if target.startswith('#') else sender_nick
 
             if message.startswith("!") or message.startswith("!"):
                 try:
                     cmd_parts = message[1:].split()
                     cmd = cmd_parts[0].lower()
                     args = cmd_parts[1:]
-                    reply_target = target if target.startswith('#') else sender_nick
                     self.handle_command(sender_nick, reply_target, cmd, args)
                 except IndexError:
                     pass
@@ -451,11 +500,14 @@ class IRCBot:
                     cmd = cmd_parts[0].lower()
                     args = cmd_parts[1:]
                     if target.startswith('#'):
-                        reply_target = target
                         self.handle_command(nick, reply_target, cmd, args)
                 except IndexError:
                     pass
-
+            else:
+                titles = url_titles(message)
+                if len(titles) > 0:
+                    for t in titles:
+                        self.send_message(reply_target, '⤷ ' + t)
         elif command == "JOIN":
             args = params
             if len(params) >= 1:
-- 
cgit v1.0