import re
import requests
import html
# Absolute http/https URLs.  The final character class excludes trailing
# punctuation such as ',' '.' ';' so "see https://x.y/z." captures
# "https://x.y/z" without the sentence-ending dot.
url_pattern = re.compile(
    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
    re.IGNORECASE
)
# Text of the first <title> element.  Non-greedy `(.*?)` (instead of the
# previous `[^>]*?`) lets a literal '>' appear inside the title text, and
# makes the DOTALL flag actually useful for titles spanning lines.
title_regex = re.compile(
    r'<title[^>]*>(.*?)</title>',
    re.IGNORECASE | re.DOTALL
)
# Browser-like request headers so servers serve the normal HTML page.
headers = {
"User-Agent": "spider/2.1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
}
def url_titles(text):
    """Fetch the HTML <title> of every unique http(s) URL found in *text*.

    Args:
        text: Arbitrary text that may contain zero or more URLs.

    Returns:
        A list of unescaped title strings, one per URL that yielded a
        non-empty <title>, in first-seen URL order.  URLs that fail
        (non-200, not HTML, no title, network error) are skipped with a
        diagnostic print and do not raise.
    """
    found_urls = url_pattern.findall(text)
    # Deduplicate while preserving first-seen order (the original code
    # collected every match but then only ever used the first one).
    unique_urls = list(dict.fromkeys(found_urls))
    results = []
    MAX_CONTENT_SIZE = 5 * 1024 * 1024  # stop downloading after 5 MiB
    for url in unique_urls:
        try:
            with requests.get(
                url,
                timeout=10,
                allow_redirects=True,
                stream=True,  # so we can cap the download size
                headers=headers
            ) as response:
                if response.status_code != requests.codes.ok:
                    print("html resp error:" + str(response.status_code))
                    continue
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    print("not html")
                    continue
                # Accumulate chunks in a list and join once: repeated
                # `bytes +=` is quadratic in the worst case.
                chunks = []
                size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    chunks.append(chunk)
                    size += len(chunk)
                    if size >= MAX_CONTENT_SIZE:
                        break
                content = b''.join(chunks)
                # Honor the charset declared in the Content-Type header,
                # falling back to utf-8 (previously utf-8 was hard-coded,
                # mangling e.g. GBK pages).
                encoding = 'utf-8'
                if 'charset=' in content_type:
                    declared = content_type.split('charset=', 1)[1].split(';', 1)[0].strip()
                    if declared:
                        encoding = declared
                try:
                    html_doc = content.decode(encoding, errors='ignore')
                except LookupError:
                    # Server declared a codec name Python does not know.
                    html_doc = content.decode('utf-8', errors='ignore')
                match = title_regex.search(html_doc)
                if match:
                    title_content = match.group(1).strip()
                    if title_content:
                        results.append(html.unescape(title_content))
                    else:
                        print('title empty')
                else:
                    print('title not found in html')
        except Exception as e:
            # Best-effort scraper: report and move on to the next URL.
            print(e)
    return results
if __name__ == '__main__':
    # Manual smoke test against two live pages.
    for sample in (
        '<> https://en.wikipedia.org/wiki/A-normal_form',
        '<> https://www.bilibili.com/video/BV1c31iBDEXY/',
    ):
        print(url_titles(sample))