# Preparation
What is a crawler? A web crawler (also called a web spider or web robot, and in the FOAF community more often a "web chaser") is a program or script that automatically fetches information from the World Wide Web according to certain rules. Less common names include ant, automatic indexer, emulator, and worm. See the Baidu Baike entry for details.
With the arrival of the big data era, the demand for data resources keeps growing, and crawlers are a good way to collect data automatically. I also recommend reading a breakdown of the Python web crawler learning path.
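To make the idea concrete, here is a minimal sketch of what "automatically fetching web information" looks like in Python. The target https://example.com and the link-listing logic are placeholders of my own, not part of the project below.

```python
# Minimal crawler sketch: download one page and list the links on it.
# https://example.com is only a placeholder target.
import urllib.request
from bs4 import BeautifulSoup

html = urllib.request.urlopen("https://example.com").read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
for a in soup.find_all("a"):
    print(a.get("href"), "->", a.get_text(strip=True))
```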
A few Python learning links:
1. 请叫我汪海's CSDN blog
2. 廖雪峰's tutorial, plus the video version of it
3. The crawler framework Scrapy
# Getting to Work
# Installation Complete
# Find Some Image-Scraping Code
Those before us planted the trees; we get to enjoy the shade. Thank you!
# Install Dependencies
Install the required packages with pip install <package>. Note that urllib already ships with the Python 3 standard library, so the third-party packages this script actually needs are BeautifulSoup and lxml:
pip install beautifulsoup4 lxml
When I ran this, pip told me it needed to be upgraded:
python -m pip install --upgrade pip
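Before running anything, a quick import check (my own addition, not from the original post) confirms the two third-party packages the script relies on are available:

```python
# Sanity check: the script below needs beautifulsoup4 and the lxml parser.
from bs4 import BeautifulSoup  # installed as beautifulsoup4
import lxml                    # used indirectly via BeautifulSoup(..., 'lxml')

print("Dependencies OK")
```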
# Run the Project
python main.py
Run it, and the images download successfully.
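One caveat: BASE_DIR in the script points to ../images, and os.mkdir only creates the per-album subdirectory, so ../images itself has to exist (or BASE_DIR has to be changed) before the script will run without errors.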
# The Code and Its Source
Try it yourself; you might be in for a surprise (practice makes perfect!).
# -*- coding:utf-8 -*-
import os
import random
import ssl
import time
import urllib.request

from bs4 import BeautifulSoup

# Request header configuration
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
# Site to download from
BASE_URL = "https://www.mzitu.com"
# Directory the images are saved under
BASE_DIR = "../images"


def start_work(serial_id):
    picture_dir = BASE_DIR + os.sep + serial_id
    if not os.path.exists(picture_dir):
        os.mkdir(picture_dir)
    page_count = get_page_count(serial_id)
    print("%s has %d images in total" % (serial_id, page_count))
    get_image_for_serial(picture_dir, serial_id, page_count)


# Get the number of pages (one image per page) in an album
def get_page_count(serial_id):
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()  # skip certificate verification
    url = "%s/%s" % (BASE_URL, serial_id)
    req = urllib.request.Request(url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    str_content = content.decode("utf-8")
    total_count = __get_counts(str_content)
    return total_count


# Parse the page count out of the pagination links
def __get_counts(html_content):
    page_count = 0
    soup = BeautifulSoup(html_content, 'lxml')
    data = soup.select("body > div.main > div.content > div.pagenavi > a > span")
    if data and len(data) >= 3:
        page_count = int(data[-2].get_text())
    return page_count


# Extract the image URL from a single page
def get_image_url(html_content):
    soup = BeautifulSoup(html_content, 'lxml')
    data = soup.select("body > div.main > div.content > div.main-image > p > a > img")
    url = None
    try:
        url = data[0].get("src")
    except Exception as ex:
        print("exception occurred: %s" % ex)
    return url


# Collect the image URLs of every page in an album
def get_all_image_urls(serial_id, page_count):
    url_list = list()
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()
    if page_count <= 1:
        return url_list
    for x in range(1, page_count + 1):
        print("Fetching the URL of image %d" % x)
        url = "%s/%s/%s" % (BASE_URL, serial_id, x)
        req = urllib.request.Request(url, headers=header)
        resp = urllib.request.urlopen(req, context=context)
        content = resp.read()
        str_content = content.decode("utf-8")
        img_url = get_image_url(str_content)
        if img_url:
            url_list.append(img_url)
            print("URL of image %d is: %s" % (x, img_url))
        time.sleep(random.randint(1, 2))
    return url_list


# Download every image in an album
def get_image_for_serial(dir_path, serial_id, total_count):
    for i in range(1, total_count + 1):
        print("Starting to fetch image %d" % i)
        get_image_for_index(dir_path, serial_id, i)
        sleep_seconds = random.randint(1, 10) / 10
        time.sleep(sleep_seconds)


# Download the image on a specific page
def get_image_for_index(dir_path, serial_id, page_index):
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()
    print("Fetching the URL of image %d" % page_index)
    ref_url = "%s/%s/%s" % (BASE_URL, serial_id, page_index)
    req = urllib.request.Request(ref_url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    str_content = content.decode("utf-8")
    img_url = get_image_url(str_content)
    if img_url:
        print("URL of image %d is: %s" % (page_index, img_url))
        print("Trying to save image %s" % img_url)
        save_img(dir_path, img_url, ref_url)


# Save a batch of images (not used by the main flow; BASE_URL serves as the Referer here)
def save_imgs(dir_path, img_urls):
    for img_addr in img_urls:
        save_img(dir_path, img_addr, BASE_URL)


# Save a single image; the Referer header mimics a browser arriving from the album page
def save_img(dir_path, img_url, ref_url):
    header = {
        "user-agent": USER_AGENT,
        "Referer": ref_url
    }
    context = ssl._create_unverified_context()
    req = urllib.request.Request(img_url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    with open(dir_path + os.sep + img_url.split('/')[-1], 'wb') as f:
        f.write(content)
    print("Saved to directory %s: %s" % (dir_path, img_url.split('/')[-1]))
    time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    vol_list = ["204061"]
    for serial_id in vol_list:
        start_work(serial_id)
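Two details in the code are worth calling out: save_img sends a Referer header pointing back to the page the image came from, which image hosts commonly require before they will serve a file, and the random time.sleep calls between requests keep the crawl from hammering the site.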
# Summary
Have I learned Python? Not really! I have only managed to install Python and run one example, which counts as a first hands-on attempt at crawling. So far, all I know are some basic types and functions.