import os
import re
import time
from urllib.parse import urljoin, urlsplit

import requests
# Browser-like User-Agent sent with every request made by this script.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# Gallery page that is scraped when the script is run directly.
PAGE_URL = "https://www.vmgirls.com/15270.html"
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Seconds to pause before each image download, to avoid hammering the server.
DOWNLOAD_DELAY = 2

# Post title; used as the name of the output directory.
TITLE_RE = re.compile(r'<h1 class="post-title mb-3">(.*?)</h1>')
# Anchor tags that wrap the full-size images.
IMAGE_RE = re.compile(r'<a rel="nofollow" href="(.*?)" alt=".*?" title=".*?">')
# Characters that are not allowed in file names on common file systems.
UNSAFE_CHARS_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def sanitize_name(name):
    """Return *name* with path-unsafe characters replaced by ``_`` and trimmed."""
    return UNSAFE_CHARS_RE.sub('_', name).strip()


def parse_dir_name(html):
    """Return a directory name derived from the last post title in *html*.

    Raises:
        ValueError: if the page has no post title, or the title is empty
            once unsafe characters have been removed.
    """
    titles = TITLE_RE.findall(html)
    if not titles:
        raise ValueError("post title not found in page")
    dir_name = sanitize_name(titles[-1])
    if not dir_name:
        raise ValueError("post title is empty")
    return dir_name


def parse_image_urls(html, base_url=PAGE_URL):
    """Return the absolute image URLs linked from *html*.

    Relative and protocol-relative links are resolved against *base_url*;
    links that are already absolute are returned unchanged.
    """
    return [urljoin(base_url, href) for href in IMAGE_RE.findall(html)]


def file_name_from_url(url):
    """Return the last path component of *url*, without query or fragment.

    Returns an empty string when the URL path ends in ``/``.
    """
    return sanitize_name(urlsplit(url).path.rsplit('/', 1)[-1])


def fetch(url):
    """GET *url* with the shared headers.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def main(page_url=PAGE_URL, delay=DOWNLOAD_DELAY):
    """Download every image linked from *page_url* into a title-named folder.

    The folder is created in the current working directory. An image that
    fails to download is reported and skipped so one bad link does not
    abort the whole run.
    """
    # Request the page.
    html = fetch(page_url).text

    # Parse the page.
    dir_name = parse_dir_name(html)
    os.makedirs(dir_name, exist_ok=True)
    urls = parse_image_urls(html, page_url)

    # Save the images.
    for index, url in enumerate(urls, start=1):
        time.sleep(delay)
        # Fall back to a numbered name when the URL has no usable file name.
        file_name = file_name_from_url(url) or "image_{}".format(index)
        print(url)
        try:
            response = fetch(url)
        except requests.RequestException as exc:
            print("skipped {}: {}".format(url, exc))
            continue
        with open(os.path.join(dir_name, file_name), 'wb') as f:
            f.write(response.content)


if __name__ == "__main__":
    main()
# 高级设计师
# by: Python自学 发表于:2022-08-16 02:15:37 顶(9) | 踩(1) 回复
# 自己动手封装成函数,爬取目录循环调用
# 回复评论