发表 · 2 分钟读完(大约 264 个字)
Python爬取网络图片
直接上源码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
| ''' 用Python爬某新闻网站的照片 PS:仅测试爬虫功能,滥用后果自负 ''' import requests from hashlib import md5 import re import os
# Page that find_img_url() fetches and scans for image links.
WEB_URL = 'https://new.qq.com/omn/20200207/20200207A0OSQX00.html'

# Directory that download_image() writes files into.
# NOTE(review): the '**' segment looks like a redacted user name -- replace it
# with a real path before running.
IMAGE_PATH = '/Users/**/workspaces/pythonP/img1'
def find_img_url():
    """Fetch the page at WEB_URL and collect the image URLs it references.

    Two kinds of references are recognised:

    * ``<img src="..." class="content-picture">`` tags, whose ``src`` value is
      returned with ``https:`` prepended;
    * double-quoted ``http://inews.gtimg.com/newsapp_bt/0/.../1000`` links,
      returned without the surrounding quotes.

    Returns:
        list[str]: URLs taken from the ``<img>`` tags first, followed by the
        quoted links; each group keeps the order found in the page.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        requests.Timeout: if the server does not respond in time.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(WEB_URL, timeout=10)
    response.raise_for_status()
    # Decode using the encoding detected from the body rather than the header.
    response.encoding = response.apparent_encoding
    html = response.text

    # [^"]* keeps the match inside a single src attribute; a lazy ".*?" could
    # start at an earlier <img> tag and swallow everything up to the first
    # content-picture class, yielding a garbage URL.
    tag_sources = re.findall(r'<img src="([^"]*)" class="content-picture">', html)
    # Presumably these src values are protocol-relative ("//host/path"), which
    # is why a scheme is prepended -- TODO confirm against the live page.
    img_urls = ['https:' + src for src in tag_sources]

    # Host dots are escaped so they only match literal dots, and the capture
    # group drops the surrounding quotes.
    quoted_links = re.findall(
        r'"(http://inews\.gtimg\.com/newsapp_bt/0/[^"]*?/1000)"', html
    )
    img_urls.extend(quoted_links)
    return img_urls
def download_image(img_url):
    """Download one image and save it under IMAGE_PATH, named by its MD5 digest.

    The file name is ``<md5 of the downloaded bytes>.jpg``, so identical
    content fetched from different URLs is stored only once; an existing file
    with the same name is left untouched.

    Args:
        img_url: URL of the image to download.

    Raises:
        requests.HTTPError: if the request returns an error status.
        requests.Timeout: if the server does not respond in time.
        OSError: if the target directory or file cannot be created.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()
    content = response.content

    # open() does not create missing parent directories, so make sure the
    # target directory exists before writing.
    os.makedirs(IMAGE_PATH, exist_ok=True)

    # The '.jpg' extension is fixed; the actual image format is not inspected.
    file_path = '{0}/{1}.{2}'.format(IMAGE_PATH, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # The with-block closes the file on exit; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)
# Script entry: collect the image URLs from the page, then download each one.
# The result is bound to `img_urls` rather than `list` so the builtin `list`
# stays usable for the rest of the module.
img_urls = find_img_url()
for img_url in img_urls:
    print(img_url, ' download...')
    download_image(img_url)
# Reports the number of URLs found; download_image() skips files that already
# exist, so fewer files may have been written.
print("一共下载{}条数据!".format(len(img_urls)))
|