# [python] Code library
import re,requests,openpyxl,time,bs4,threading
from lxml import etree
# HTTP headers sent with every request: a desktop browser user agent.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70'
    ),
}

# i: number of the next listing page handed out by getUrl.
# n: index of the next entry handed out by gerAddr.
i = 1
n = 0

Name = []      # not used by the code visible in this file
web_url = {}   # title -> play-page URL, filled by getUrl
addr = {}      # title -> download address, filled by gerAddr

# Re-entrant lock guarding the shared counters and dictionaries above.
lock = threading.RLock()
def getUrl():
    """Fetch one listing page and record its title -> play-page URL pairs.

    Each call claims the next page number from the shared counter ``i``
    (under ``lock``), downloads that page and merges the links it finds into
    the shared ``web_url`` dict.  Any failure is reported on stdout and
    otherwise ignored, so one bad page does not stop the crawl.
    """
    global i, web_url
    base_url = 'https://www.c345y.com/vmovie/sm/'

    # Claim a page number atomically.  ``with`` guarantees the lock is
    # released even if something inside raises.
    with lock:
        page = i
        i += 1
        print('----------------' + str(page) + '----------')

    # Page 1 is the bare directory URL; later pages are index_<page>.html.
    if page == 1:
        page_url = base_url
    else:
        page_url = base_url + 'index_' + str(page) + '.html'

    try:
        res = requests.get(url=page_url, headers=header, timeout=30)
        res.close()
        time.sleep(0.5)
        res.encoding = 'utf8'
        html = etree.HTML(res.text)
        # Indexing [0] doubles as a sanity check: it raises if the expected
        # container is missing from the page.
        itms = html.xpath('/html/body/div[6]/div/div[2]//dl')[0]
        # Both expressions start with '//', so they match across the whole
        # document rather than only below ``itms``.
        links = itms.xpath('//dl/dt/a/@href')
        titles = itms.xpath('//dl/dt/a/@title')
        with lock:
            for j, title in enumerate(titles):
                # NOTE(review): the +4 offset presumably skips leading anchors
                # that carry no title attribute -- confirm against the markup.
                # NOTE(review): this host (c234y) differs from the listing
                # host (c345y) -- confirm that is intentional.
                web_url[title] = 'https://www.c234y.com' + links[j + 4]
    except Exception:
        # ``page_url`` is always a string here, so the message can be built
        # no matter how far the try block got.
        print('访问' + page_url + '失败!')
def gerAddr():
    """Resolve the download address of the next pending entry into ``addr``.

    Each call claims the next index from the shared counter ``n`` (under
    ``lock``), fetches the play page ``dz[index]`` and stores the value of the
    element with id ``url`` as ``addr[mz[index]]``.  Failures are reported on
    stdout and otherwise ignored.
    """
    global addr, n, mz, dz

    # Claim exactly one index per call while holding the lock; the lock is
    # released automatically on every exit path.
    with lock:
        print('----------------' + str(n + 1) + '----------')
        idx = n
        n += 1
        try:
            mz1 = mz[idx]
            dz1 = dz[idx]
        except IndexError:
            # More workers were started than there are entries left.
            print('访问失败')
            return
        print(mz1, dz1)

    # Network and parsing happen outside the lock so workers run in parallel.
    try:
        res2 = requests.get(url=dz1, headers=header, timeout=30)
        res2.close()
        html2 = etree.HTML(res2.text)
        addr[mz1] = html2.xpath('//*[@id="url"]/@value')[0]
        time.sleep(0.5)
    except Exception:
        # The lock is not held here and the index was already consumed, so
        # there is nothing to release and ``n`` must not be advanced again.
        print('访问失败')
# Crawl the listing pages: 16 rounds of 5 concurrent getUrl workers.
print(time.asctime())
for k in range(16):
    batch = []
    for _ in range(5):
        worker = threading.Thread(target=getUrl)
        worker.start()
        batch.append(worker)
    # Wait for the whole round before starting the next one.
    for worker in batch:
        worker.join()
print(time.asctime())
# Flatten the crawl results into two parallel lists: titles and play-page URLs.
mz = []
dz = []
file = '影片列表.xlsx'
print(web_url.items())
for title, play_url in web_url.items():
    mz.append(title)
    dz.append(play_url)
print(mz, dz)

# The workbook must already exist on disk.  Only the active sheet is used, so
# no lookup by sheet name is done (a fixed name would raise KeyError whenever
# the workbook's sheet is called something else).
wb = openpyxl.load_workbook(file)
sheet = wb.active
sheet['A1'] = '序号'
sheet['B1'] = '影片名'
sheet['C1'] = '播放网址'
sheet['D1'] = '下载地址'

# Data rows start at spreadsheet row 2, below the header row.
for nu, (title, play_url) in enumerate(zip(mz, dz)):
    # Show the entry that is actually being written to this row.
    print(title, play_url)
    row = str(nu + 2)
    sheet['A' + row] = str(nu + 1)
    sheet['B' + row] = title
    sheet['C' + row] = play_url
wb.save(file)

print(len(mz))
# Number of entries still to be handed to the address workers.
flag = len(mz)
print(time.asctime())
# Resolve download addresses in rounds of 5 gerAddr workers until every entry
# has been handed out (the last round may start more workers than entries).
while flag > 0:
    batch = []
    for _ in range(5):
        worker = threading.Thread(target=gerAddr)
        worker.start()
        batch.append(worker)
    for worker in batch:
        worker.join()
    flag -= 5
print(addr.items())
print('----------------')

sheet = wb.active
# Walk every title rather than only len(addr) rows: when some lookups failed,
# addr is shorter than mz and the trailing rows would otherwise be skipped
# even though their addresses were found.  Missing addresses are written as
# None (an empty cell).
for row, title in enumerate(mz, start=2):
    sheet['D' + str(row)] = addr.get(title)
# NOTE(review): the purpose of this pause is not evident from the code; kept.
time.sleep(2)
wb.save(file)
print(time.asctime())
# --- Download section ---
import requests,openpyxl,threading,time,re
# HTTP headers sent with every download request: a desktop browser user agent.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.70'
    ),
}
# Re-entrant lock guarding the shared download counter.
lock = threading.RLock()
## Failure-retry setup (disabled alternative to the normal setup below) ##
# file = '影片列表.xlsx'
# path = 'D:\AV\\'
# n = 0
# name=[]
# url=[]
# wb = openpyxl.load_workbook(file)
# sheet = wb.active
# sheet = wb['Sheet']
# num=[]
# with open('失败统计.txt','r',encoding='utf-8') as f:
# lines=f.readlines()
# for line in lines:
# nu = re.findall(r'------ (.*?)----',line)[0]
# num.append(nu)
# name.append(sheet['B' + str(int(nu) + 1)].value)
# url.append(sheet['D' + str(int(nu) + 1)].value)
# Workbook to read the download list from (same file name the crawl stage saves).
file = '影片列表.xlsx'
# Target folder.  Backslashes are escaped explicitly because '\A' is not a
# valid escape sequence; the resulting string is unchanged.
path = 'D:\\AV\\'
# Index of the next entry to download; starts at 1 so that index 0 (the first
# row of each column, where the headers are written) is skipped.
n = 1
wb = openpyxl.load_workbook(file)
# Only the active sheet is used, so no lookup by sheet name is done (a fixed
# name would raise KeyError whenever the sheet is called something else).
sheet = wb.active
# Whole columns as tuples of cells: B holds titles, D holds download addresses.
name = sheet['B']
url = sheet['D']
def download():
    """Download the next pending video into ``path`` and report the outcome.

    Each call claims the next index from the shared counter ``n`` (under
    ``lock``), reads the title and address at that index from ``name`` and
    ``url``, fetches the address and writes the body to
    ``<path><index>、 <title>.mp4``.  Success or failure is printed with the
    elapsed time; failures are otherwise ignored.
    """
    global n
    with lock:
        n1 = n
        n += 1

    # The driver starts a fixed number of workers, which can exceed the number
    # of rows; surplus workers simply exit.
    if n1 >= len(name) or n1 >= len(url):
        return

    ## Failure-retry code ##
    # if n1>=len(num):
    #     print('----已完成----')
    #     return None
    # else:

    # ``name``/``url`` hold openpyxl cells in the normal setup and plain values
    # in the commented-out retry setup; unwrap cells and accept both.
    name1 = getattr(name[n1], 'value', name[n1])
    url1 = getattr(url[n1], 'value', url[n1])
    time1 = time.time()
    print('---', str(n1), '---', name1, '----开始下载', time.asctime())
    try:
        res = requests.get(url=url1, headers=header, timeout=30)
        # Treat HTTP error responses as failures instead of saving the error
        # page as a video.
        res.raise_for_status()
        with open(path + str(n1) + '、 ' + str(name1) + '.mp4', 'wb') as f:
            f.write(res.content)
        print('------', str(n1) + '----' + str(name1) + '下载----成功-----用时', time.time() - time1)
    except Exception:
        # str() keeps the report itself from failing when the title is empty.
        print('------', str(n1) + '----' + str(name1) + '下载----失败-----用时', time.time() - time1)
    ## Failure-retry code ##
    #----------------------------------------------------------------------------#
    # print('---', num[n1], '---', name1, '----开始下载', time.asctime())
    # try:
    #     res = requests.get(url=url1, headers=header, timeout=300)
    #     with open(path +num[n1] + '、 ' + name1 + '.mp4', 'wb') as f:
    #         f.write(res.content)
    #     print('------', num[n1] + '----' + name1 + '下载----成功-----用时', time.time() - time1)
    # except:
    #     print('------', num[n1] + '----' + name1 + '下载----失败-----用时', time.time() - time1)
# Run the downloads in 50 rounds of 5 threads.  Within a round, consecutive
# thread starts are spaced 0.2 s apart; the round ends when all five finish.
for k in range(50):
    batch = []
    for slot in range(5):
        if slot:
            time.sleep(0.2)
        worker = threading.Thread(target=download)
        worker.start()
        batch.append(worker)
    for worker in batch:
        worker.join()