blob: d367d1771f7eb1b82577776a1b38ae5bb60069dc (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
import re
import requests
import html
# Matches http:// and https:// URLs embedded in free text. The final
# character class omits trailing punctuation (".", ",", ";", ":", "!") so a
# URL that ends a sentence is captured without it.
url_pattern = re.compile(
    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
    re.IGNORECASE
)

# Captures the text between <title ...> and </title>.
# - ``\b`` keeps tags that merely start with "title" from matching.
# - ``(.*?)`` together with DOTALL allows multi-line titles and titles that
#   contain a literal ">" (the previous ``[^>]*?`` rejected those).
# - ``</title\s*>`` tolerates whitespace before the closing bracket.
title_regex = re.compile(
    r'<title\b[^>]*>(.*?)</title\s*>',
    re.IGNORECASE | re.DOTALL
)

# Request headers sent with every preview fetch. The User-Agent combines
# The Lounge's identifier with the facebookexternalhit / Twitterbot tokens
# (presumably so sites serve their link-preview markup -- confirm if changed).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; The Lounge IRC Client; +https://github.com/thelounge/thelounge)"
        " facebookexternalhit/1.1 Twitterbot/1.0"
    ),
    "X-Purpose": "preview",
}
# Matches a charset declaration such as ``charset=utf-8`` or ``charset="gbk"``.
_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?\s*([\w.:+-]+)', re.IGNORECASE)

# Matches a <meta> tag that carries a charset declaration.
_META_CHARSET_RE = re.compile(
    r'<meta\b[^>]*?charset\s*=\s*["\']?\s*([\w.:+-]+)',
    re.IGNORECASE
)


def _decode_html(body, content_type):
    """Decode *body* with its declared charset, falling back to UTF-8.

    The charset is taken from the Content-Type header value first, then from
    a <meta> declaration near the start of the document. Undecodable bytes
    are dropped rather than raising.
    """
    declared = _CHARSET_RE.search(content_type)
    if declared is None:
        # HTML requires the encoding declaration within the first 1024 bytes.
        head = bytes(body[:1024]).decode('ascii', errors='ignore')
        declared = _META_CHARSET_RE.search(head)
    if declared is not None:
        try:
            return body.decode(declared.group(1), errors='ignore')
        except (LookupError, ValueError):
            # Unknown or unusable codec name: fall through to UTF-8.
            pass
    return body.decode('utf-8', errors='ignore')


def url_titles(text):
    """Return the HTML page title of the first http(s) URL found in *text*.

    Only the first URL matched by ``url_pattern`` is fetched, so the returned
    list holds at most one title. The list is empty when no URL is present,
    the response is not a 200 ``text/html`` document, no non-empty <title>
    is found, or the request fails. Failures are printed, never raised.

    The title has HTML entities decoded and every run of whitespace
    (including newlines) collapsed to a single space.
    """
    found_urls = url_pattern.findall(text)
    # Preview only the first link in the message.
    targets = found_urls[:1]
    results = []
    # Stop downloading once roughly this many bytes have been received.
    MAX_CONTENT_SIZE = 5 * 1024 * 1024
    # NOTE(review): the URL comes straight from *text* and is fetched as-is;
    # if *text* is untrusted, consider restricting which hosts may be reached.
    for url in targets:
        try:
            with requests.get(
                url,
                timeout=10,
                allow_redirects=True,
                stream=True,
                headers=headers
            ) as response:
                if response.status_code != requests.codes.ok:
                    print("html resp error:" + str(response.status_code))
                    continue
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    print("not html")
                    continue
                # bytearray grows in place; repeated bytes += copies each time.
                content = bytearray()
                for chunk in response.iter_content(chunk_size=8192):
                    content.extend(chunk)
                    if len(content) >= MAX_CONTENT_SIZE:
                        break
                html_doc = _decode_html(content, content_type)
                match = title_regex.search(html_doc)
                if match:
                    # Unescape first so entity-encoded whitespace (&#10;,
                    # &nbsp;) is normalised along with literal whitespace.
                    title_content = ' '.join(html.unescape(match.group(1)).split())
                    if title_content:
                        results.append(title_content)
                    else:
                        print('title empty')
                else:
                    print('title not found in html')
        except Exception as e:
            # Best-effort preview: report the failure and return what we have.
            print(e)
    return results
if __name__ == '__main__':
    # Manual smoke test: fetch and print the titles for two sample messages.
    # Requires network access.
    for sample_message in (
        '<> https://en.wikipedia.org/wiki/A-normal_form',
        '<> https://www.bilibili.com/video/BV1c31iBDEXY/',
    ):
        print(url_titles(sample_message))
|