# ZOL forum (bbs.zol.com.cn) image-scraper script
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import urlretrieve
import urllib.request
import os
import re
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Sends a browser-like User-Agent header so the forum does not
    reject the request as an obvious bot.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
    # BUG FIX: the original called urlopen(url), silently discarding
    # `req` and its User-Agent header. Open the prepared Request instead.
    response = urllib.request.urlopen(req)
    html = response.read()
    return html
def find_img(url):
    """Return the list of JPEG image URLs found on the page at *url*.

    The forum lazy-loads images, so the real address lives in each
    ``img`` tag's ``data-original`` attribute; only ``http://...jpg``
    values are collected.
    """
    page = urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    tags = soup.findAll('img', {'data-original': re.compile('^http://.+jpg$')})
    return [tag.attrs["data-original"] for tag in tags]
# Removed: a commented-out earlier draft of save_img() that re-downloaded
# every image through url_open(); superseded by the urlretrieve-based
# save_img() below, which also skips files that already exist.
def save_img(folder,img_sites):
    """Download every image URL in *img_sites* into *folder*.

    Each file keeps the basename of its URL. Files already present in
    *folder* are skipped, so the function is safe to re-run.
    """
    for each in img_sites:  # walk the list of image addresses
        filename = each.split('/')[-1]  # last path component = file name
        # BUG FIX: the original ignored `folder` and relied on the caller
        # having chdir'd into it; build the target path explicitly.
        target = os.path.join(folder, filename)
        if not os.path.exists(target):  # skip already-downloaded files
            urlretrieve(each, target)
def get_htmlNums(html_liebiao):
    """Extract thread numbers from a ZOL forum listing page.

    Scans *html_liebiao* for anchors of the form
    ``target="_blank" href="/dcbbs/d16_<num>.html"`` and returns the
    ``<num>`` substrings as a list of strings, in order of appearance.
    """
    marker = 'target="_blank" href="/dcbbs/d16_'
    html_nums = []
    # Parse the HTML text directly instead of round-tripping it through
    # a hard-coded G:\img scratch file, which failed on machines without
    # that drive and added needless disk I/O.
    for line in html_liebiao.splitlines():
        start = line.find(marker)
        # BUG FIX: the original recomputed the next match position but had
        # no loop, so only the first anchor per line was captured.
        while start != -1:
            end = line.find('.html', start)  # closing part of the href
            if end != -1:
                html_nums.append(line[start + len(marker):end])
                start = line.find(marker, end)
            else:
                # Malformed anchor with no '.html'; skip past the marker.
                start = line.find(marker, start + len(marker))
    return html_nums
def download_img_zol(folder='G:\\img',time = 6):
    """Crawl the ZOL d16 forum's "good" listing pages and save their images.

    Walks listing pages 2 .. time-1, collects every thread number on each
    page, then downloads each thread's images into
    ``folder\\img<page>\\img<thread-index>``.

    folder -- root directory for downloads (default ``G:\\img``)
    time   -- one past the last listing-page index to crawl
    """
    for i in range(2, time):
        # Thread numbers found on this listing page (the site is GBK-encoded).
        nums = get_htmlNums(url_open('http://bbs.zol.com.cn/dcbbs/d16_good_p'+str(i)+'.html#c').decode('gbk'))
        wenjianjia = 1  # per-page subfolder counter
        for each in nums:
            url = 'http://bbs.zol.com.cn/dcbbs/d16_'+str(each)+'.html'  # thread page
            img_sites = find_img(url)  # image URLs on the thread page
            # Build the target directory from the `folder` parameter instead
            # of a second hard-coded 'G:\\img' (the original ignored `folder`).
            subdir = os.path.join(folder, 'img' + str(i), 'img' + str(wenjianjia))
            os.makedirs(subdir, exist_ok=True)  # replaces the if/else chdir pair
            # chdir kept so cwd-relative helpers still land in the right place.
            os.chdir(subdir)
            save_img(subdir, img_sites)  # save all images of this thread
            wenjianjia += 1
if __name__ == '__main__':
    # Run the crawler only when executed as a script, not when imported.
    download_img_zol()