"""Fetch the HTML ``<title>`` of the first http(s) URL found in a text."""

import html
import re

import requests

# Matches http:// and https:// URLs; the final character class keeps trailing
# punctuation such as "." or "," out of the captured URL.
url_pattern = re.compile(
    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
    re.IGNORECASE
)

# Captures the text between <title ...> and </title>. DOTALL lets the title
# span several lines; the lazy group stops at the first closing tag.
title_regex = re.compile(
    r'<title\b[^>]*>(.*?)</title\s*>',
    re.IGNORECASE | re.DOTALL
)

# Extracts the charset parameter from a Content-Type header value.
_charset_regex = re.compile(r'charset\s*=\s*["\']?([-\w.:]+)', re.IGNORECASE)

headers = {
    "User-Agent": "Mozilla/5.0 (compatible; The Lounge IRC Client; +https://github.com/thelounge/thelounge)"
    + " facebookexternalhit/1.1 Twitterbot/1.0",
    "X-Purpose": "preview",
}

# Stop downloading once at least this many bytes of the body were received.
MAX_CONTENT_SIZE = 5 * 1024 * 1024


def _read_body(response):
    """Return the streamed response body, reading about MAX_CONTENT_SIZE bytes at most."""
    chunks = []
    received = 0
    for chunk in response.iter_content(chunk_size=8192):
        chunks.append(chunk)
        received += len(chunk)
        if received >= MAX_CONTENT_SIZE:
            break
    # Join once instead of growing an immutable bytes object per chunk.
    return b''.join(chunks)


def _decode_html(content, content_type):
    """Decode *content* with the charset declared in *content_type*.

    Falls back to UTF-8 when no charset is declared or the declared one is
    unknown or not a text codec. Undecodable bytes are dropped.
    """
    declared = _charset_regex.search(content_type)
    if declared:
        try:
            return content.decode(declared.group(1), errors='ignore')
        except (LookupError, ValueError):
            pass  # unknown / non-text codec: use the UTF-8 fallback below
    return content.decode('utf-8', errors='ignore')


def url_titles(text):
    """Return the page title of the first URL found in *text*.

    Only the first http(s) URL in *text* is fetched. The request follows
    redirects, times out after 10 seconds and is streamed so that the
    download can be cut off at MAX_CONTENT_SIZE.

    Args:
        text: Arbitrary text that may contain URLs.

    Returns:
        A list holding the HTML-unescaped title, or an empty list when
        there is no URL, the response is not a successful ``text/html``
        one, no non-empty title is found, or the request fails. Failures
        are printed, never raised.
    """
    found_urls = url_pattern.findall(text)
    results = []

    # Only the first URL is looked up.
    for url in found_urls[:1]:
        try:
            with requests.get(
                url,
                timeout=10,
                allow_redirects=True,
                stream=True,
                headers=headers
            ) as response:
                if response.status_code != requests.codes.ok:
                    print("html resp error:" + str(response.status_code))
                    continue

                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    print("not html")
                    continue

                content = _read_body(response)

            html_doc = _decode_html(content, content_type)

            match = title_regex.search(html_doc)
            if match:
                title_content = match.group(1).strip()
                if title_content:
                    results.append(html.unescape(title_content))
                else:
                    print('title empty')
            else:
                print('title not found in html')

        except Exception as e:
            # Best-effort lookup: report the failure and return what we have.
            print(e)

    return results


if __name__ == '__main__':
    print(url_titles('<> https://en.wikipedia.org/wiki/A-normal_form'))
    print(url_titles('<> https://www.bilibili.com/video/BV1c31iBDEXY/'))