爬取网站的favicon.ico

这个文件是显示在浏览器网站标题旁边的那个小图标。

网站一般会把它放在网站根目录,也有的是在网页里指定它的路径。

代码1:

import requests
from fake_useragent import UserAgent
from lxml import etree
import os
from urllib.parse import urljoin

url = "https://www.w1w.cc"  # site root; do not add a trailing slash

# Use a random browser User-Agent so simple anti-bot checks don't reject us.
# NOTE: the header key must be "User-Agent" (with a hyphen). The original
# key "UserAgent" is not a real HTTP header, so requests silently fell back
# to its default "python-requests/x.y" UA.
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
}
res = requests.get(url, headers=headers)
res.raise_for_status()  # fail loudly instead of parsing an error page
# res.text decodes with the charset the server declared (hard-coding utf-8
# breaks on gbk/gb2312 sites, still common on Chinese web).
html = res.text

# Parse the HTML and collect favicon URLs from <link rel="...icon"> tags.
xpath_dbs = '/html/head/link[contains(@rel,"icon")]/@href'
parse_html = etree.HTML(html)
parse_favicon_list = parse_html.xpath(xpath_dbs)

# Use the host part of the URL as the download folder name
# (renamed from `dir`, which shadowed the builtin).
site_dir = url.split('/')[2]
os.makedirs(site_dir, exist_ok=True)

for parse_url in parse_favicon_list:
    # urljoin correctly handles absolute ("http://..."), root-relative
    # ("/x.ico"), protocol-relative ("//cdn/x.ico") and plain relative hrefs;
    # the original `url + parse_url` concat broke on several of these.
    favicon_url = urljoin(url + '/', parse_url)
    icon_res = requests.get(favicon_url, headers=headers)
    if icon_res.status_code != 200:
        # Skip this candidate instead of saving an error page as an icon.
        continue
    filename = favicon_url.split('/')[-1] or 'favicon.ico'
    with open(os.path.join(site_dir, filename), 'wb') as f:
        f.write(icon_res.content)
    print('从html标签中获取favicon %s 成功' % filename)

代码2:

import requests
from fake_useragent import UserAgent
from lxml import etree
import os

url = "http://oa.com"  # site root; do not add a trailing slash

# By web convention the favicon usually lives at <root>/favicon.ico.
favicon_url = url + "/" + "favicon.ico"

# The header key must be "User-Agent" (hyphenated); the original "UserAgent"
# key is not a real HTTP header and was ignored by servers.
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
}

res = requests.get(favicon_url, headers=headers)

# Require an actual 200 with a body: a 404 error page also has non-empty
# content, so checking the bytes alone (as the original did) is not enough.
if res.status_code == 200 and res.content:
    # Use the host part of the URL as the folder name.
    site_dir = url.split('/')[2]
    os.makedirs(site_dir, exist_ok=True)

    dir_favicon = os.path.join(site_dir, "favicon.ico")

    with open(dir_favicon, 'wb') as f:
        f.write(res.content)
    # BUG fix: the original printed the undefined name `filename` here,
    # which raised NameError on the success path.
    print('从网站根目录直接获取favicon favicon.ico 成功')

else:
    print("从网站根目录直接获取favicon失败,将尝试另一种方式")

最后,写成类的形式:

import requests
from fake_useragent import UserAgent
from lxml import etree
import os


class SpiderFavicon:
    """Fetch a site's favicon.

    Strategy: first try the conventional <root>/favicon.ico, then fall back
    to the <link rel="...icon"> tags declared in the homepage HTML.
    Downloads are saved into a folder named after the site's host.
    """

    def __init__(self, site):
        # site: site root URL without a trailing slash, e.g. "https://example.com"
        self.url = site
        # Host part of the URL, used as the download folder. Set here so that
        # save() works even when called without going through run().
        self.dir = site.split('/')[2]
        ua = UserAgent()
        # The header key must be "User-Agent" (hyphenated). The original key
        # "UserAgent" is not a real HTTP header, so requests sent its default
        # "python-requests" UA — which sites like douban block (the failure
        # mentioned at the end of the article).
        self.headers = {
            'User-Agent': ua.random,
        }

    def get_html_bytes(self, url=None):
        """Download `url` (or the site root when omitted) and return the body.

        Returns b'' on a non-200 response or a network error, so callers can
        treat a falsy result as failure instead of saving an error page.
        """
        target = url if url else self.url
        try:
            res = requests.get(target, headers=self.headers, timeout=10)
        except requests.RequestException:
            return b''
        return res.content if res.status_code == 200 else b''

    def parse_html(self, xpath_dbs):
        """Parse the homepage and return the hrefs matched by `xpath_dbs`.

        Returns an empty list when the page could not be fetched or parsed.
        """
        html_bytes = self.get_html_bytes()
        if not html_bytes:
            return []
        # lxml accepts bytes and honors the document's own encoding
        # declaration; decoding as utf-8 up front broke gbk/gb2312 pages.
        root = etree.HTML(html_bytes)
        if root is None:  # unparseable document (replaces the bare except)
            return []
        return root.xpath(xpath_dbs)

    def root_favicon(self):
        """Try the conventional <root>/favicon.ico location."""
        favicon_url = self.url + "/" + "favicon.ico"
        return bool(self.save([favicon_url]))

    def html_favicon(self):
        """Try the <link rel="...icon" href="..."> tags in the page head."""
        xpath_dbs = '/html/head/link[contains(@rel,"icon")]/@href'
        return bool(self.save(self.parse_html(xpath_dbs)))

    def save(self, favicon_url_list):
        """Download every URL in the list into self.dir.

        Returns True only when the list is non-empty and every download
        succeeded; stops at the first failure.
        """
        from urllib.parse import urljoin  # stdlib; local so the class stays drop-in

        if not favicon_url_list:
            return False
        os.makedirs(self.dir, exist_ok=True)
        for parse_url in favicon_url_list:
            # urljoin handles absolute, root-relative ("/x.ico"),
            # protocol-relative ("//cdn/x.ico") and plain relative hrefs;
            # the original `self.url + parse_url` concat broke on several.
            favicon_url = urljoin(self.url + '/', parse_url)
            html_bytes = self.get_html_bytes(favicon_url)
            if not html_bytes:
                return False
            filename = favicon_url.split('/')[-1] or 'favicon.ico'
            dir_favicon = os.path.join(self.dir, filename)

            with open(dir_favicon, 'wb') as f:
                f.write(html_bytes)
        return True

    def run(self):
        """Entry point: root favicon first, HTML <link> tags as fallback."""
        self.dir = self.url.split('/')[2]
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        if self.root_favicon():
            print('从网站根目录直接获取favicon成功')
        else:
            print("从网站根目录直接获取favicon失败,将尝试另一种方式")
            if self.html_favicon():
                print('从html标签中获取favicon成功')
            else:
                print('从html标签中获取favicon失败,请采用其他方式')


if __name__ == '__main__':
    # Site root URL — no trailing slash.
    target_site = "https://www.douban.com"
    SpiderFavicon(target_site).run()

经我测试,在获取豆瓣的favicon时遇到了问题,其他正常网站可以爬取。原因在于 headers 里的键写成了 'UserAgent'——这不是合法的 HTTP 请求头('User-Agent' 中间有连字符),requests 因此发送了默认的 python-requests UA,被豆瓣的反爬机制拦截;改成 'User-Agent' 即可。