diff options
Diffstat (limited to 'ircbot/urltitle.py')
| -rw-r--r-- | ircbot/urltitle.py | 71 |
1 files changed, 71 insertions, 0 deletions
# Reconstructed from the diff that adds ircbot/urltitle.py
# (new file mode 100644, index 0000000..2d06fdc).
"""Look up the HTML <title> of the first URL mentioned in a chat message."""

import html
import re

# Matches http(s) URLs embedded in free text. The final character class omits
# trailing punctuation (",", ".", ";", "!", ":") so "see https://x.org/a." does
# not swallow the full stop.
# NOTE: parentheses are not in either character class, so a URL containing
# "(" or ")" is cut short at the first one.
url_pattern = re.compile(
    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
    re.IGNORECASE
)

# Captures the text of the first <title> element. The body is matched with
# [^<]* so that a literal ">" inside the title ("Home > Docs") is accepted,
# while the match still cannot run past the next tag.
title_regex = re.compile(
    r'<title\b[^>]*>([^<]*)</title\s*>',
    re.IGNORECASE | re.DOTALL
)

# Request headers sent with every fetch.
# NOTE(review): "br" is advertised explicitly; as far as I know requests can
# only decode Brotli when a brotli package is installed -- confirm that the
# deployment has one, otherwise the body arrives undecoded and no title is
# found.
headers = {
    "User-Agent": "spider/2.1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
}

# Stop downloading once this many bytes of the body have been received.
MAX_CONTENT_SIZE = 5 * 1024 * 1024

# Content types that are parsed for a <title>.
_HTML_CONTENT_TYPES = ('text/html', 'application/xhtml+xml')

# How much of the body is searched for a <meta ... charset=...> declaration.
_META_SNIFF_BYTES = 4096

_HEADER_CHARSET = re.compile(r'charset\s*=\s*["\']?([-\w.:]+)', re.IGNORECASE)
_META_CHARSET = re.compile(
    rb'<meta[^>]+charset\s*=\s*["\']?([-\w.:]+)',
    re.IGNORECASE
)


def _read_body(response, limit):
    """Return the streamed response body, stopping once *limit* bytes arrived."""
    chunks = []
    received = 0
    for chunk in response.iter_content(chunk_size=8192):
        chunks.append(chunk)
        received += len(chunk)
        if received >= limit:
            break
    # Joining once avoids re-copying the growing buffer on every chunk.
    return b''.join(chunks)


def _detect_encoding(content, content_type):
    """Pick a text encoding from the Content-Type header or a <meta> tag.

    The header wins over the document; 'utf-8' is returned when neither
    names an encoding that Python can use to decode text.
    """
    candidates = []
    declared = _HEADER_CHARSET.search(content_type)
    if declared:
        candidates.append(declared.group(1))
    sniffed = _META_CHARSET.search(content[:_META_SNIFF_BYTES])
    if sniffed:
        candidates.append(sniffed.group(1).decode('ascii', errors='ignore'))
    for name in candidates:
        try:
            # Probe with an empty input: unknown names and non-text codecs
            # are rejected here instead of failing on the real document.
            b''.decode(name)
        except (LookupError, ValueError):
            continue
        return name
    return 'utf-8'


def _extract_title(html_doc):
    """Return the cleaned <title> text, '' if it is blank, None if absent."""
    match = title_regex.search(html_doc)
    if match is None:
        return None
    # Unescape first so entity-encoded whitespace is collapsed too; a title
    # must end up on a single line before it is relayed to a chat channel.
    return ' '.join(html.unescape(match.group(1)).split())


def url_titles(text):
    """Return the page title of the first http(s) URL found in *text*.

    Only the first URL is fetched. The result is a list holding that page's
    title, or an empty list when there is no URL, the response is not an
    OK HTML page, no non-blank <title> is present, or the request fails.
    Failures are reported with print() and are not raised to the caller.

    NOTE(review): the URL comes straight from the message text and nothing
    here restricts which hosts are contacted -- confirm whether internal
    addresses need to be filtered before this is exposed to untrusted users.
    """
    if not text:
        return []
    found = url_pattern.search(text)
    if found is None:
        return []

    # Imported here rather than at module level so that importing this module
    # and the common "message has no URL" path do not need the HTTP stack.
    # Kept outside the try block so a missing dependency is not swallowed.
    import requests

    url = found.group(1)
    results = []
    # Broad catch on purpose: this is the boundary for untrusted network
    # input, and callers rely on getting a list back rather than an exception.
    try:
        with requests.get(
            url,
            timeout=10,
            allow_redirects=True,
            stream=True,
            headers=headers
        ) as response:
            if response.status_code != requests.codes.ok:
                print("html resp error:" + str(response.status_code))
                return results
            content_type = response.headers.get('Content-Type', '').lower()
            if not any(kind in content_type for kind in _HTML_CONTENT_TYPES):
                print("not html")
                return results
            content = _read_body(response, MAX_CONTENT_SIZE)

        encoding = _detect_encoding(content, content_type)
        # errors='ignore' also covers a multi-byte character that was cut in
        # half by the size limit.
        title = _extract_title(content.decode(encoding, errors='ignore'))
        if title is None:
            print('title not found in html')
        elif not title:
            print('title empty')
        else:
            results.append(title)
    except Exception as e:
        print(e)
    return results


if __name__ == '__main__':
    print(url_titles('<> https://en.wikipedia.org/wiki/A-normal_form'))
    print(url_titles('<> https://www.bilibili.com/video/BV1c31iBDEXY/'))
\ No newline at end of file |
