# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Save the scraped rows to an Excel file.
def save_excel(result_list):
    """Write result_list (an iterable of row lists) to result.xlsx.

    Each element of result_list is appended as one worksheet row.

    Returns:
        True on success, False on any failure (the failure is reported
        on stdout rather than raised, keeping best-effort semantics).
    """
    try:
        workbook = Workbook()
        booksheet = workbook.active
        for result_row in result_list:
            # append() writes one row; each list item becomes a cell.
            booksheet.append(result_row)
        workbook.save("result.xlsx")
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows SystemExit/KeyboardInterrupt.
        print(result_list)
        print("save_data_error")
        return False
    return True
# Parse one page of HTML; BeautifulSoup makes the table traversal convenient.
def page_data_handle(data):
    """Extract [topic, author] pairs from one discussion-page HTML string.

    The topics and authors live in the table with id="posts-table"; the
    table's children alternate between whitespace nodes (even indices)
    and row nodes, with index 1 being the header row.

    Returns:
        A list of [topic, author] rows to keep paginating, or False when
        the current page is the last page or parsing fails — the caller
        treats a falsy result as "stop".
    """
    try:
        bsoup = BeautifulSoup(data, 'html.parser')
        # Rows of interest are the children of the posts table.
        data_detail = bsoup.find_all("table", id="posts-table")[0].contents
        result_list = []
        for i, data_tr in enumerate(data_detail):
            # Skip whitespace children (even i) and the header row (i == 1).
            if i % 2 == 1 and i != 1:
                topic = data_tr.contents[1].contents[1].string.strip()
                author = data_tr.contents[3].contents[1].string.strip()
                result_list.append([topic, author])
        result = result_list
        # The span with class "thispage" holds the current page number and,
        # in its data-total-page attribute, the total page count.
        page_handle = bsoup.find_all("span", attrs={"class": "thispage"})[0]
        print("第" + page_handle.string + "页")
        if int(page_handle.string) == int(page_handle["data-total-page"]):
            result = False  # last page reached: signal the caller to stop
    except Exception:
        # Narrowed from a bare except; any parse failure also stops the loop.
        result = False
    return result
# headers make the request look like it comes from a real browser.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Optional proxy configuration (requests from another machine); empty = direct.
# Example:
# proxies = {
#     "http": "192.168.10.8:1025",
#     "https": "185.202.123.22:562"
# }
proxies = {}
s = requests.session()  # a session keeps cookies across page requests
try:
    # Initial state: page counter and the Excel header row.
    page_num = 0
    result_list = [["主题", "作者"]]
    while True:
        # Fetch each page's HTML; the site lists 20 threads per page.
        page_num += 1
        start = (page_num - 1) * 20
        url = ("https://movie.douban.com/subject/27186619/discussion/"
               "?start=" + str(start) + "&sort_by=time")
        # timeout keeps a stalled request from hanging the scraper forever.
        page = s.get(url=url, headers=headers, proxies=proxies, timeout=30)
        temp = page_data_handle(page.text)  # parse the raw HTML
        if temp:
            # Merge this page's rows with everything collected so far.
            result_list.extend(temp)
        else:
            break  # last page reached (or a parse error): stop paginating
    temp = save_excel(result_list)  # persist the collected rows
    print(temp)
finally:
    # Always release the session, even if a request raises.
    s.close()