# Scrape the Maoyan Top 100 board (working)
from pathlib import Path
from urllib.request import urlopen
'''
https://maoyan.com/board/4?offset=10
https://maoyan.com/board/4?offset=20
https://maoyan.com/board/4?offset=30
...
https://maoyan.com/board/4?offset=90
'''
def get_one_page(index):
    """Download one page of the Maoyan Top 100 board and return its HTML.

    Args:
        index: Zero-based page number. It is multiplied by 10 to form the
            ``offset`` query parameter of the board URL.

    Returns:
        The response body decoded to ``str`` using the default codec of
        ``bytes.decode()``.

    Errors raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    url = 'https://maoyan.com/board/4?offset=%d' % (index * 10)
    # Alternative formatting style, with {} as the placeholder:
    #     url = 'https://maoyan.com/board/4?offset={}'.format(index * 10)
    # The with-block closes the HTTP response even if read/decode fails.
    with urlopen(url) as response:
        return response.read().decode()
def save_one_page(index, html):
    """Write one page of HTML to ``maoyan/TOP100_page_<index + 1>.html``.

    The ``maoyan`` directory is relative to the current working directory
    and is created when it does not exist yet. An existing file with the
    same name is overwritten.

    Args:
        index: Zero-based page number; the file name uses ``index + 1``.
        html: Page content as ``str``; it is written as UTF-8 text.
    """
    # Join path components with pathlib rather than a literal backslash, so
    # the file lands inside the "maoyan" directory on every platform.
    target = Path('maoyan') / 'TOP100_page_{}.html'.format(index + 1)
    # Without this, opening the file fails when the directory is missing.
    target.parent.mkdir(parents=True, exist_ok=True)
    # File modes: 'r' / 'w' / 'a' read, write or append text;
    # 'rb' / 'wb' / 'ab' do the same with bytes.
    with target.open('w', encoding='utf-8') as out_file:
        out_file.write(html)
# __name__ is a built-in module attribute: it equals '__main__' when this
# file is executed directly, and the module's name when the file is imported
# by another module.
if __name__ == '__main__':
    # Pages 0..9 map to offsets 0, 10, ..., 90 and files 1..10.
    for index in range(0, 10):
        html = get_one_page(index)
        save_one_page(index, html)
初级程序员
by: 云代码会员 发表于:2020-04-22 15:49:44 顶(0) | 踩(0) 回复
啥玩意呀,爬完之后直接猫眼都不让登录了
回复评论