#!/usr/bin/env python
# encoding=utf-8
import codecs
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Workbook that collects the scraped rows; main() saves it to dest_filename.
wb = Workbook()
dest_filename = '电影.xlsx'
# Reuse the sheet that is created together with the workbook and title it.
ws1 = wb.active
ws1.title = "电影top250"
# First page of the list; pagination links are resolved relative to this URL.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'
def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries a desktop-browser User-Agent header (presumably
    because the site rejects the default client string -- not verifiable
    from this file).

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    # Without a timeout a stalled connection would hang the crawl forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    return response.content
def get_li(doc):
    """Extract the movie entries and the next-page URL from one list page.

    Args:
        doc: HTML of a list page (``str`` or ``bytes``).

    Returns:
        A 5-tuple ``(name, star_con, score, info_list, next_url)``. The
        first four items are parallel lists with one entry per ``<li>``
        of the ``ol.grid_view`` element: the title, the text node that
        contains '评价' (rating count), the rating text, and the short
        quote ('无' when the entry has none). ``next_url`` is the absolute
        URL of the following page, or ``None`` when there is no next link.
        If the page has no ``ol.grid_view`` at all, the lists are empty and
        ``next_url`` is ``None``.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    name = []       # movie titles
    star_con = []   # rating-count text
    score = []      # rating values (as text)
    info_list = []  # short quotes

    ol = soup.find('ol', class_='grid_view')
    if ol is None:
        # Not a list page (e.g. an error or block page): nothing to extract
        # and no pagination to follow.
        return name, star_con, score, info_list, None

    # Compile once; the pattern is the same for every entry.
    votes_pattern = re.compile('评价')
    for item in ol.find_all('li'):
        detail = item.find('div', attrs={'class': 'hd'})
        # Only the first span.title is used as the movie name.
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()
        level_star = item.find('span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        star_num = star.find(text=votes_pattern)
        info = item.find('span', attrs={'class': 'inq'})
        # Not every entry has a quote; keep the lists aligned with a placeholder.
        info_list.append(info.get_text() if info else '无')
        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)

    # span.next may be missing entirely, and on the last page it has no <a>.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span is not None else None
    if page:
        # urljoin resolves relative, query-only and absolute hrefs alike.
        return name, star_con, score, info_list, urljoin(DOWNLOAD_URL, page['href'])
    return name, star_con, score, info_list, None
def main():
    """Crawl all list pages and write the collected rows to the workbook.

    Starting at DOWNLOAD_URL, each page is downloaded and parsed, following
    the next-page URL returned by get_li() until it is None. Every movie is
    then written to its own worksheet row (columns A-D: title, rating count,
    rating, quote) and the workbook is saved to dest_filename.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)

    # Number rows by position. Looking the row up with name.index() is
    # quadratic and sends movies that share a title to the same row.
    rows = zip(name, star_con, score, info)
    for row, (title, votes, rating, quote) in enumerate(rows, start=1):
        ws1['A%s' % row] = title
        ws1['B%s' % row] = votes
        ws1['C%s' % row] = rating
        ws1['D%s' % row] = quote
    wb.save(filename=dest_filename)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
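# --- Web-page residue copied along with the code (reader comment, not part of the script) ---
# 初级程序员
# by: 云代码会员 发表于:2022-08-07 15:02:55 顶(0) | 踩(0) 回复
# 运行之后怎么没有反应?
# 回复评论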