import requests
from random import choice, randint
from lxml import etree
import os
from concurrent.futures import ThreadPoolExecutor
from time import sleep

if not os.path.exists('华晨宇的照片'):
    os.mkdir('华晨宇的照片')
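# NOTE (editor): `choice` is imported above but never used in the posted code; it was
# presumably meant for rotating User-Agent headers. The list and helper below are an
# editor's assumption, not part of the original script. If win4000.com rejects bare
# requests, the requests.get() calls can pass headers=get_headers().
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) '
    'Version/14.0 Safari/605.1.15',
]

def get_headers():
    # build request headers with a randomly chosen User-Agent (this is what `choice` is for)
    return {'User-Agent': choice(USER_AGENTS)}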
# Collect the gallery URLs from the first 5 list pages
def get_taotu_url():
    taotu_urls = []
    for i in range(1, 6):
        url = f'http://www.win4000.com/mt/hcy_{i}.html'  # adjust this URL pattern for other galleries on the site
        # send the request and get the response
        rep = requests.get(url)
        # print(rep.status_code)  # status code 200
        # print(rep.text)
        html = etree.HTML(rep.text)
        taotu_url = html.xpath('//div[@class="tab_tj"]/div/div/ul/li/a/@href')
        # filter out invalid URLs (valid gallery links are exactly 39 characters long)
        taotu_url = [item for item in taotu_url if len(item) == 39]
        # each list page holds 24 galleries
        # print(taotu_url, len(taotu_url), sep='\n')
        taotu_urls.extend(taotu_url)
    return taotu_urls
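# NOTE (editor): the length-39 filter above is brittle. If win4000.com ever changes its
# link format, get_taotu_url() silently returns an empty list and main() then downloads
# nothing; printing len(taotu_urls) before starting the thread pool makes that visible.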
# Enter a gallery's detail page and download its images
def get_img(url):
    # send the request and get the response
    rep = requests.get(url)
    # parse the response
    html = etree.HTML(rep.text)
    # get the gallery name and the maximum page number
    name = html.xpath('//div[@class="ptitle"]/h1/text()')[0]
    os.mkdir('./华晨宇的照片/{}'.format(name))
    max_page = html.xpath('//div[@class="ptitle"]/em/text()')
    # string replacement so the per-page URLs can be built below
    url1 = url.replace('.html', '_{}.html')
    # page through this gallery and download every image
    for i in range(1, int(max_page[0]) + 1):
        # build the page URL
        url2 = url1.format(i)
        # sleep to avoid hammering the server
        sleep(randint(1, 3))
        # send the request and get the response
        reps = requests.get(url2)
        # parse the response
        dom = etree.HTML(reps.text)
        # locate and extract the image download link
        src = dom.xpath('//div[@class="main-wrap"]/div[1]/a/img/@data-original')[0]
        # build the filename used for saving
        file_name = name + f'第{i}张.jpg'
        # download the image, save it, and print a progress message
        img = requests.get(src).content
        with open('./华晨宇的照片/{}/{}'.format(name, file_name), 'wb') as f:
            f.write(img)
        print(f'成功下载图片:{file_name}')
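# NOTE (editor): on a re-run, os.mkdir above raises FileExistsError for galleries that
# were already created, and that worker thread dies. A tolerant variant (an editor's
# sketch, not part of the original) would replace that line with:
#     os.makedirs('./华晨宇的照片/{}'.format(name), exist_ok=True)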
# Main entry point: dispatch galleries across a thread pool
def main():
    taotu_urls = get_taotu_url()
    with ThreadPoolExecutor(max_workers=4) as executor:
        # consume the iterator so any exception raised in a worker surfaces here
        # instead of being silently swallowed
        list(executor.map(get_img, taotu_urls))
    print('=================== 图片全部下载成功啦! =====================')

if __name__ == '__main__':
    main()
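If the script fails silently (the comment below reports it did not work), two likely causes are win4000.com rejecting header-less requests and the page layout having changed so the XPath queries match nothing. A minimal hardened request helper, sketched under the assumption that the get_headers() helper above is used (the 10-second timeout is an arbitrary choice), could replace the bare requests.get() calls:

def fetch(url):
    # GET with browser-like headers and a timeout; returns None instead of raising
    try:
        rep = requests.get(url, headers=get_headers(), timeout=10)
        rep.raise_for_status()
        return rep
    except requests.RequestException as e:
        print(f'Request failed for {url}: {e}')
        return None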
Comment by 云代码会员 (junior programmer), posted 2021-01-03 15:37:31:
Didn't get it working.