from bs4 import BeautifulSoup |
from urllib.request import urlopen |
from urllib.request import urlretrieve |
import urllib.request |
import os |
import re |
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A Request object is built so that a browser-like User-Agent header can
    be attached, making the scraper less likely to be blocked by the site.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
    # Bug fix: the original called urlopen(url) with the bare URL, so the
    # Request object (and its User-Agent header) was never actually used.
    response = urllib.request.urlopen(req)
    html = response.read()
    return html
def find_img(url):
    """Return the 'data-original' URLs of all lazy-loaded JPEG images on *url*.

    Only <img> tags whose 'data-original' attribute matches an
    ``http://...jpg`` address are collected.
    """
    page = urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    jpg_pattern = re.compile('^http://.+jpg$')
    tags = soup.findAll('img', {'data-original': jpg_pattern})
    return [tag.attrs["data-original"] for tag in tags]
def save_img(folder, img_sites):
    """Download every image URL in *img_sites* into *folder*.

    The file name is the last path component of each URL.  Files that
    already exist are skipped, so an interrupted run can be resumed.

    (The commented-out earlier version of this function was removed as
    dead code.)
    """
    for site in img_sites:
        filename = site.split('/')[-1]  # last URL segment is the file name
        # Bug fix: the original ignored ``folder`` and checked/saved the
        # file relative to the current working directory, relying on the
        # caller having chdir'd first.  Join explicitly so the function
        # works regardless of the caller's cwd.
        target = os.path.join(folder, filename)
        if not os.path.exists(target):  # skip files already downloaded
            urlretrieve(site, target)
def get_htmlNums(html_liebiao):
    """Extract thread numbers from a forum listing page.

    Scans *html_liebiao* (decoded HTML text) for links of the form
    ``target="_blank" href="/dcbbs/d16_<num>.html`` and returns the list of
    ``<num>`` strings in order of appearance.

    Bug fixes vs. the original:
    - The HTML was round-tripped through a hard-coded ``G:\\img\\liebiao.txt``
      file just to iterate its lines; the text is now scanned in memory, so
      there is no filesystem side effect and it works on any platform.
    - The "search again from past the previous hit" step recomputed ``a``
      but never looped, so at most one link per line was found; a real
      ``while`` loop now collects every link on a line, as the original
      comment intended.
    """
    marker = 'target="_blank" href="/dcbbs/d16_'
    html_nums = []
    for line in html_liebiao.splitlines():
        a = line.find(marker)
        while a != -1:
            b = line.find('.html', a)  # first '.html' after the marker
            if b != -1:
                html_nums.append(line[a + len(marker):b])
            else:
                b = a + len(marker)  # no terminator: just skip past this hit
            a = line.find(marker, b)
    return html_nums
def download_img_zol(folder='G:\\img', time=6):
    """Crawl ZOL forum 'good' listing pages 2..time-1 and download all images.

    For listing page *i*, every thread page is fetched, its image URLs
    collected via find_img(), and the images saved under
    ``<folder>/img<i>/img<k>`` where *k* numbers the threads within that page.

    NOTE(review): requires network access and a writable *folder*.  Listing
    pages are decoded as GBK (the site serves GBK-encoded HTML).  The
    parameter name ``time`` shadows the stdlib module but is kept for
    backward compatibility.
    """
    for i in range(2, time):
        wenjianjia = 1  # per-listing-page thread counter ("folder" in pinyin)
        listing_url = 'http://bbs.zol.com.cn/dcbbs/d16_good_p' + str(i) + '.html#c'
        # All thread numbers on this listing page.
        nums = get_htmlNums(url_open(listing_url).decode('gbk'))
        for each in nums:
            url = 'http://bbs.zol.com.cn/dcbbs/d16_' + str(each) + '.html'
            img_sites = find_img(url)  # image URLs on this thread page
            # Bug fix: the original hard-coded 'G:\\img' here even though the
            # destination is configurable via *folder*.
            target = os.path.join(folder, 'img' + str(i), 'img' + str(wenjianjia))
            # Replaces the original's duplicated exists/chdir branches.
            os.makedirs(target, exist_ok=True)
            os.chdir(target)  # save_img historically wrote relative to cwd
            save_img(target, img_sites)
            wenjianjia += 1


if __name__ == '__main__':  # guard: don't start a full crawl on import
    download_img_zol()