Python爬取网络图片

直接上源码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
'''
用Python爬某新闻网站的照片
PS:仅测试爬虫功能,滥用后果自负
'''
import requests
from hashlib import md5
import re
import os

# Article page that is fetched and searched for image links.
WEB_URL = 'https://new.qq.com/omn/20200207/20200207A0OSQX00.html'
# Directory that receives the downloaded images.
# NOTE(review): the '**' segment is presumably a redacted user name -- confirm
# and replace with a real path before running.
IMAGE_PATH = '/Users/**/workspaces/pythonP/img1'

def find_img_url(url=None, timeout=10):
    """Return the image URLs found in the HTML of a news article page.

    Two kinds of references are collected, in this order:

    1. ``<img src="..." class="content-picture">`` tags, whose ``src`` is
       resolved against the page URL (so protocol-relative ``//host/path``
       values become ``https://host/path`` for an https page).
    2. Double-quoted absolute ``http://inews.gtimg.com/newsapp_bt/0/.../1000``
       links appearing anywhere in the page source.

    Args:
        url: Page to fetch. Defaults to the module-level ``WEB_URL``.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script forever.

    Returns:
        A list of URL strings; duplicates are not removed.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    # Function-scope import keeps this block self-contained without touching
    # the file's import section.
    from urllib.parse import urljoin

    page_url = WEB_URL if url is None else url
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    # Decode with the encoding requests detects from the body rather than the
    # one inferred from the HTTP headers.
    response.encoding = response.apparent_encoding
    html = response.text

    # Example tag:
    #   <img src="//inews.gtimg.com/newsapp_bt/0/11299639206/1000" class="content-picture">
    # [^"] keeps the capture inside a single attribute value; a bare ".*?"
    # could run across several attributes or tags.
    tag_pattern = r'<img src="([^"]*)" class="content-picture">'
    # Example link: "http://inews.gtimg.com/newsapp_bt/0/11299639205/1000"
    quoted_pattern = r'"(http://inews\.gtimg\.com/newsapp_bt/0/[^"]*?/1000)"'

    img_urls = [urljoin(page_url, src) for src in re.findall(tag_pattern, html)]
    img_urls.extend(re.findall(quoted_pattern, html))
    return img_urls

def download_image(img_url, timeout=10):
    """Download one image and save it under ``IMAGE_PATH``.

    The file is named ``<md5 of the content>.jpg``, so an image whose bytes
    were already saved is not written a second time.

    Args:
        img_url: URL of the image to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script forever.

    Raises:
        requests.HTTPError: If the image request returns an error status.
    """
    response = requests.get(img_url, timeout=timeout)
    response.raise_for_status()
    content = response.content

    # The target directory may not exist on a first run; without this the
    # open() below fails with FileNotFoundError.
    os.makedirs(IMAGE_PATH, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(IMAGE_PATH, file_name)

    # os.path.exists returns True/False; skip the write when a file with the
    # same content hash is already present.
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)

# Script driver: collect the image URLs from the page, download each one in
# turn, then report how many URLs were processed.
# The result is bound to `img_urls` rather than `list` so the builtin `list`
# stays usable for the rest of the module.
img_urls = find_img_url()
for img_url in img_urls:
    print(img_url, ' download...')
    download_image(img_url)
print("一共下载{}条数据!".format(len(img_urls)))

作者

Dench

发布于

2020-02-08

更新于

2020-02-08

许可协议

CC BY-NC-SA 4.0

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×