这个文件是显示在浏览器网站标题旁边的那个小图标。
网站一般会把它放在网站根目录,也有的是在网页里指定它的路径。
代码1:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import requests
from fake_useragent import UserAgent
from lxml import etree
import os
from urllib.parse import urljoin

url = "https://www.w1w.cc"  # root URL, no trailing slash
ua = UserAgent()
headers = {
    # BUG FIX: the header key must be 'User-Agent'. With the old 'UserAgent'
    # key the random UA was silently ignored and requests' default UA was
    # sent, which some sites (e.g. douban) reject.
    'User-Agent': ua.random,
}
res = requests.get(url, headers=headers)
res.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html = res.content.decode('utf-8')
# print(html)

# Parse the home page and collect every <link rel="...icon..."> href.
xpath_dbs = '/html/head/link[contains(@rel,"icon")]/@href'
parse_html = etree.HTML(html)
parse_favicon_list = parse_html.xpath(xpath_dbs)

# Use the host part of the URL as the download directory name.
dir = url.split('/')[2]
if not os.path.exists(dir):
    os.makedirs(dir)

for parse_url in parse_favicon_list:
    # BUG FIX: urljoin resolves absolute, protocol-relative ('//cdn/...') and
    # relative ('icon.png') hrefs; plain concatenation only handled hrefs
    # starting with '/'.
    favicon_url = urljoin(url + '/', parse_url)
    html_bytes = requests.get(url=favicon_url, headers=headers).content
    filename = favicon_url.split('/')[-1]
    dir_favicon = dir + '/' + filename
    with open(dir_favicon, 'wb') as f:
        f.write(html_bytes)
    print('从html标签中获取favicon %s 成功' % filename)
|
代码2:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import requests
from fake_useragent import UserAgent
from lxml import etree
import os

url = "http://oa.com"  # root URL, no trailing slash
favicon_url = url + "/" + "favicon.ico"
ua = UserAgent()
headers = {
    # BUG FIX: the header key must be 'User-Agent', not 'UserAgent', or the
    # random UA is ignored and requests' default UA is sent instead.
    'User-Agent': ua.random,
}
res = requests.get(favicon_url, headers=headers)
# BUG FIX: a 404/error page still has a non-empty body, so truthiness of the
# content alone is not a success check — require a 2xx status as well.
if res.ok and res.content:
    html_bytes = res.content
    # Use the host part of the URL as the download directory name.
    dir = url.split('/')[2]
    if not os.path.exists(dir):
        os.makedirs(dir)
    dir_favicon = dir + '/' + "favicon.ico"
    with open(dir_favicon, 'wb') as f:
        f.write(html_bytes)
    # BUG FIX: `filename` was never defined in this script (NameError on the
    # success path); the saved name is always the literal "favicon.ico".
    print('从网站根目录直接获取favicon %s 成功' % "favicon.ico")
else:
    print("从网站根目录直接获取favicon失败,将尝试另一种方式")
|
最后,写成类的形式:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
| import requests
from fake_useragent import UserAgent
from lxml import etree
import os
class SpiderFavicon:
    """Download a site's favicon: first try /favicon.ico, then <link> tags.

    Construct with the site's root URL (no trailing slash) and call run().
    Icons are saved into a directory named after the URL's host.
    """

    def __init__(self, site):
        # site: root URL without a trailing slash, e.g. "https://example.com"
        self.url = site
        ua = UserAgent()
        self.headers = {
            # BUG FIX: the header key must be 'User-Agent'. The old
            # 'UserAgent' key meant requests' default UA was actually sent,
            # which douban rejects (HTTP 418) — this is the failure the
            # author observed when testing against douban.
            'User-Agent': ua.random,
        }

    def get_html_bytes(self, url=None):
        """Fetch `url` (defaults to self.url); return the body, b'' on HTTP error."""
        res = requests.get(url or self.url, headers=self.headers)
        # BUG FIX: error pages (404/418) have non-empty bodies, so callers'
        # truthiness checks passed on failures; map non-2xx to empty bytes.
        return res.content if res.ok else b''

    def parse_html(self, xpath_dbs):
        """Return the list of hrefs the xpath matches in the home page, [] on failure."""
        html = self.get_html_bytes().decode('utf-8', errors='replace')
        try:
            parse_html = etree.HTML(html)
            return parse_html.xpath(xpath_dbs)
        except etree.LxmlError:  # narrowed from a bare except:
            return []

    def root_favicon(self):
        """Try the conventional /favicon.ico location; True on success."""
        favicon_url = self.url + "/" + "favicon.ico"
        return bool(self.save([favicon_url]))

    def html_favicon(self):
        """Try <link rel="...icon..."> tags in the page <head>; True on success."""
        xpath_dbs = '/html/head/link[contains(@rel,"icon")]/@href'
        return bool(self.save(self.parse_html(xpath_dbs)))

    def save(self, favicon_url_list):
        """Download each URL into self.dir; True only if every download succeeds."""
        from urllib.parse import urljoin  # local import: keep class drop-in compatible
        if not favicon_url_list:
            return False
        for parse_url in favicon_url_list:
            # BUG FIX: urljoin resolves protocol-relative ('//cdn/...') and
            # relative hrefs that plain concatenation mangled.
            favicon_url = urljoin(self.url + '/', parse_url)
            html_bytes = self.get_html_bytes(favicon_url)
            if not html_bytes:
                return False
            filename = favicon_url.split('/')[-1]
            dir_favicon = self.dir + '/' + filename
            with open(dir_favicon, 'wb') as f:
                f.write(html_bytes)
        return True

    def run(self):
        """Create the output directory (named after the host) and try both strategies."""
        self.dir = self.url.split('/')[2]
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        if self.root_favicon():
            print('从网站根目录直接获取favicon成功')
        else:
            print("从网站根目录直接获取favicon失败,将尝试另一种方式")
            # Fallback: scrape the icon URL out of the page's <link> tags.
            if self.html_favicon():
                print('从html标签中获取favicon成功')
            else:
                print('从html标签中获取favicon失败,请采用其他方式')
if __name__ == '__main__':
    # Entry point: crawl the favicon of the target site (no trailing slash).
    target = "https://www.douban.com"
    SpiderFavicon(target).run()
|
经我测试,在获取豆瓣的favicon时遇到了问题,其他正常网站可以爬取。原因是 headers 中请求头的键写成了 'UserAgent',正确的键应为 'User-Agent';写错时随机 UA 实际没有生效,发出的是 requests 默认的 UA,豆瓣会拦截这种请求,修正键名后即可正常获取。