import re
import requests
import html
# Absolute http/https URLs.  The final character class excludes trailing
# punctuation such as ',' '.' ';' so "see https://x.y/z." captures
# "https://x.y/z" without the sentence-ending dot.
url_pattern = re.compile(
    r'(\bhttps?:\/\/[-A-Z0-9+&@#/%?=~_|!:,.;]*[-A-Z0-9+&@#/%?=~_|])',
    re.IGNORECASE
)
# Text of the first <title> element.  Non-greedy `(.*?)` (instead of the
# previous `[^>]*?`) lets a literal '>' appear inside the title text, and
# makes the DOTALL flag actually useful for titles spanning lines.
title_regex = re.compile(
    r'<title[^>]*>(.*?)</title>',
    re.IGNORECASE | re.DOTALL
)
# Browser-like request headers so servers serve the normal HTML page.
headers = {
"User-Agent": "spider/2.1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"Connection": "keep-alive",
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
}
def url_titles(text):
    """Fetch the HTML <title> of every unique http(s) URL found in *text*.

    Args:
        text: Arbitrary text that may contain zero or more URLs.

    Returns:
        A list of unescaped title strings, one per URL that yielded a
        non-empty <title>, in first-seen URL order.  URLs that fail
        (non-200, not HTML, no title, network error) are skipped with a
        diagnostic print and do not raise.
    """
    found_urls = url_pattern.findall(text)
    # Deduplicate while preserving first-seen order (the original code
    # collected every match but then only ever used the first one).
    unique_urls = list(dict.fromkeys(found_urls))
    results = []
    MAX_CONTENT_SIZE = 5 * 1024 * 1024  # stop downloading after 5 MiB
    for url in unique_urls:
        try:
            with requests.get(
                url,
                timeout=10,
                allow_redirects=True,
                stream=True,  # so we can cap the download size
                headers=headers
            ) as response:
                if response.status_code != requests.codes.ok:
                    print("html resp error:" + str(response.status_code))
                    continue
                content_type = response.headers.get('Content-Type', '').lower()
                if 'text/html' not in content_type:
                    print("not html")
                    continue
                # Accumulate chunks in a list and join once: repeated
                # `bytes +=` is quadratic in the worst case.
                chunks = []
                size = 0
                for chunk in response.iter_content(chunk_size=8192):
                    chunks.append(chunk)
                    size += len(chunk)
                    if size >= MAX_CONTENT_SIZE:
                        break
                content = b''.join(chunks)
                # Honor the charset declared in the Content-Type header,
                # falling back to utf-8 (previously utf-8 was hard-coded,
                # mangling e.g. GBK pages).
                encoding = 'utf-8'
                if 'charset=' in content_type:
                    declared = content_type.split('charset=', 1)[1].split(';', 1)[0].strip()
                    if declared:
                        encoding = declared
                try:
                    html_doc = content.decode(encoding, errors='ignore')
                except LookupError:
                    # Server declared a codec name Python does not know.
                    html_doc = content.decode('utf-8', errors='ignore')
                match = title_regex.search(html_doc)
                if match:
                    title_content = match.group(1).strip()
                    if title_content:
                        results.append(html.unescape(title_content))
                    else:
                        print('title empty')
                else:
                    print('title not found in html')
        except Exception as e:
            # Best-effort scraper: report and move on to the next URL.
            print(e)
    return results
if __name__ == '__main__':
    # Manual smoke test against two live pages.
    for sample in (
        '<> https://en.wikipedia.org/wiki/A-normal_form',
        '<> https://www.bilibili.com/video/BV1c31iBDEXY/',
    ):
        print(url_titles(sample))