标签: Python - 个人小站

2020-02-08 发表 · 2 分钟读完（大约 264 个字）
直接上源码
'''
用Python爬某新闻网站的照片
PS:仅测试爬虫功能，滥用后果自负
'''
import requests
from hashlib import md5
import re
import os

WEB_URL = 'https://new.qq.com/omn/20200207/20200207A0OSQX00.html'
IMAGE_PATH='/Users/**/workspaces/pythonP/img1'

# Find every image URL in the article page using regular expressions.
def find_img_url():
    """Fetch ``WEB_URL`` and return the image URLs found in its HTML.

    Two kinds of references are collected, in this order:

    1. ``<img src="..." class="content-picture">`` tags. Protocol-relative
       sources (``//host/path``) are turned into ``https://host/path``.
    2. Quoted ``http://inews.gtimg.com/newsapp_bt/0/<id>/1000`` URLs that
       appear anywhere in the page text.

    Returns:
        A list of URL strings. Duplicates are not removed.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        requests.Timeout: if the server does not answer within 10 seconds.
    """
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(WEB_URL, timeout=10)
    response.raise_for_status()
    # Let requests guess the charset from the body instead of trusting
    # the response headers.
    response.encoding = response.apparent_encoding
    html = response.text

    img_urls = []

    # [^"]* keeps the match inside a single attribute value; a lazy ".*?"
    # could run across quotes into a neighbouring tag.
    # Example tag:
    #   <img src="//inews.gtimg.com/newsapp_bt/0/11299639206/1000" class="content-picture">
    # Resulting URL:
    #   https://inews.gtimg.com/newsapp_bt/0/11299639206/1000
    tag_pattern = r'<img src="([^"]*)" class="content-picture">'
    for src in re.findall(tag_pattern, html):
        # Only protocol-relative sources need a scheme; prefixing an
        # already absolute URL would yield "https:http://...".
        if src.startswith('//'):
            src = 'https:' + src
        img_urls.append(src)

    # Absolute image URLs quoted elsewhere in the page; the capture group
    # drops the surrounding quotes.
    quoted_pattern = r'"(http://inews\.gtimg\.com/newsapp_bt/0/[^"]*/1000)"'
    img_urls.extend(re.findall(quoted_pattern, html))

    return img_urls

# Download a single image into the local image directory.
def download_image(img_url):
    """Download ``img_url`` and save it under ``IMAGE_PATH``.

    The file name is the MD5 hex digest of the downloaded bytes with a
    fixed ``.jpg`` extension, so an image whose content was already saved
    is not written again.

    Args:
        img_url: URL of the image to fetch.

    Raises:
        requests.HTTPError: if the image request returns an error status.
        requests.Timeout: if the server does not answer within 10 seconds.
        OSError: if the target directory or file cannot be created.
    """
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()
    content = response.content

    # open() does not create missing directories, so make sure the target
    # directory exists before writing.
    os.makedirs(IMAGE_PATH, exist_ok=True)
    file_path = '{0}/{1}.{2}'.format(IMAGE_PATH, md5(content).hexdigest(), 'jpg')

    # os.path.exists() returns True when the path exists, False otherwise.
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)

# Script driver: collect the image URLs, download each one, then report
# how many URLs were processed.
# The result is bound to `img_urls` rather than `list` so the builtin
# `list` type is not shadowed at module scope.
img_urls = find_img_url()
for img_url in img_urls:
    print(img_url, ' download...')
    download_image(img_url)
print("一共下载{}条数据！".format(len(img_urls)))
标签

最新文章

归档

Your browser is out-of-date!