"""Extract the HTML <title> of the first HTTP(S) URL found in a piece of text."""

import html
import re

# Matches an http:// or https:// URL embedded in free text.
url_pattern = re.compile(
    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
    re.IGNORECASE
)

# Captures the text between <title ...> and </title>.
# NOTE: the previous pattern (r']*>([^>]*?)') was corrupted -- the literal
# '<title' and '</title>' tag names were missing, so it matched any '>' and
# always captured an empty string ("title empty" for every page).
title_regex = re.compile(
    r'<title[^>]*>(.*?)</title>',
    re.IGNORECASE | re.DOTALL
)

# Pulls an explicit charset out of a Content-Type header,
# e.g. "text/html; charset=gb2312" -> "gb2312".
_charset_regex = re.compile(r'charset=["\']?([\w-]+)', re.IGNORECASE)

# Browser-like request headers so servers return the normal HTML page.
# NOTE: 'br' was removed from Accept-Encoding -- requests can only decode
# Brotli when the optional `brotli` package is installed; advertising it
# without support can make the response body undecodable.
headers = {
    "User-Agent": "spider/2.1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
}


def url_titles(text):
    """Find the first URL in *text*, fetch it, and return its page title.

    Returns a list with at most one HTML-unescaped title string; an empty
    list when no URL is found, the fetch fails, the response is not HTML,
    or no non-empty <title> can be extracted.
    """
    # Imported lazily so the regex helpers above remain usable (and
    # testable) even when the third-party `requests` package is absent.
    import requests

    found_urls = url_pattern.findall(text)
    # Deliberately limit fetching to the first URL found in the text.
    unique_urls = found_urls[:1]

    results = []
    MAX_CONTENT_SIZE = 5 * 1024 * 1024  # cap each download at 5 MiB

    for url in unique_urls:
        try:
            with requests.get(
                url, timeout=10, allow_redirects=True, stream=True,
                headers=headers
            ) as response:
                if response.status_code != requests.codes.ok:
                    print("html resp error:" + str(response.status_code))
                    continue

                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    print("not html")
                    continue

                # Stream the body in chunks, stopping at the size cap so a
                # huge page cannot exhaust memory.
                content = b''
                for chunk in response.iter_content(chunk_size=8192):
                    content += chunk
                    if len(content) >= MAX_CONTENT_SIZE:
                        break

                # Honour a charset declared in the Content-Type header;
                # fall back to UTF-8 (previously hard-coded, which garbled
                # e.g. gb2312-encoded Chinese pages).
                charset_match = _charset_regex.search(content_type)
                encoding = charset_match.group(1) if charset_match else 'utf-8'
                try:
                    html_doc = content.decode(encoding, errors='ignore')
                except LookupError:
                    # Server declared a codec name Python does not know.
                    html_doc = content.decode('utf-8', errors='ignore')

                match = title_regex.search(html_doc)
                if match:
                    title_content = match.group(1).strip()
                    if title_content:
                        results.append(html.unescape(title_content))
                    else:
                        print('title empty')
                else:
                    print('title not found in html')
        except Exception as e:
            # Best-effort spidering: report and continue rather than
            # propagate network/parse errors to the caller.
            print(e)
    return results


if __name__ == '__main__':
    print(url_titles('<> https://en.wikipedia.org/wiki/A-normal_form'))
    print(url_titles('<> https://www.bilibili.com/video/BV1c31iBDEXY/'))