# -*- coding: utf-8 -*- |
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def getHTMLText(url, headers):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers sent with the request.

    Returns:
        The decoded response body, or the sentinel string "爬取失败" when the
        request fails (connection error, timeout after 30 seconds, or an
        HTTP error status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel. A bare ``except``
        # would also swallow KeyboardInterrupt and genuine programming errors.
        return "爬取失败"
    # Decode using the encoding guessed from the body content rather than
    # the one taken from the response headers.
    r.encoding = r.apparent_encoding
    return r.text

def parsehtml(namelist, urllist, html):
    """Collect story links and titles from one index page.

    Every ``<a>`` inside the ``<dl class="txt_box">`` element contributes one
    absolute URL to ``urllist`` and one title to ``namelist``. Both lists are
    extended in place, stay the same length, and are also returned.

    Args:
        namelist: List that receives the story titles.
        urllist: List that receives the absolute story URLs.
        html: HTML text of the index page.

    Returns:
        The tuple ``(urllist, namelist)``. Note the order is the reverse of
        the parameter order.
    """
    base_url = 'http://www.tom61.com/'
    soup = BeautifulSoup(html, 'html.parser')
    box = soup.find('dl', attrs={'class': 'txt_box'})
    # A failed download or an unexpected layout has no story list; treat it
    # as "no links" instead of crashing on None.
    links = box.find_all('a') if box is not None else []
    for link in links:
        href = link.get('href')
        if not href:
            # Skip anchors without a target so the two lists stay aligned.
            continue
        # urljoin copes with relative, root-relative and absolute hrefs,
        # which plain string concatenation does not.
        urllist.append(urljoin(base_url, href))
        # Fall back to the visible link text when no title attribute exists.
        namelist.append(link.get('title') or link.get_text(strip=True))
    print(urllist)
    print(namelist)
    return urllist, namelist

def _index_url(page):
    """Return the URL of listing page ``page`` (1-based)."""
    base = 'http://www.tom61.com/ertongwenxue/shuiqiangushi/index'
    if page == 1:
        return base + '.html'
    return base + '_' + str(page) + '.html'


def _fetch_story(url, headers):
    """Download one story page and return its text, or None if it has no body.

    The text is the concatenation of every ``<p>`` inside the
    ``<div class="t_news_txt">`` element, with double ideographic-space
    indents removed.
    """
    soup = BeautifulSoup(getHTMLText(url, headers), 'html.parser')
    body = soup.find('div', class_='t_news_txt')
    if body is None:
        return None
    text = ''.join(p.text for p in body.find_all('p'))
    return text.replace('\u3000\u3000', '')


def main():
    """Crawl listing pages 1-19 and save each story to ``myshortStories/``.

    For every listing page the story links are collected, each story is
    downloaded (pausing one second between requests), and then each story is
    appended to ``myshortStories/Day<N>.txt`` as the title, a newline, and
    the story text. ``N`` is ``(page - 1) * 70 + position``, i.e. the file
    numbering assumes 70 stories per listing page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    }
    stories_per_page = 70
    out_dir = 'myshortStories'
    # All stories go into a single directory.
    os.makedirs(out_dir, exist_ok=True)

    for i in range(1, 20):  # listing pages
        page_url = _index_url(i)
        print("正在爬取第%s页的故事链接:" % (i))
        print(page_url + '\n')
        html = getHTMLText(page_url, headers)
        if html == "爬取失败":
            # getHTMLText's failure sentinel: nothing to parse on this page.
            print("第%s页爬取失败,已跳过" % (i))
            continue

        # Fresh lists per page: reusing one pair of lists across pages made
        # every page re-crawl and re-write all stories of the earlier pages.
        urls, storiesNames = parsehtml([], [], html)
        offset = (i - 1) * stories_per_page

        littleStories = []
        for m, story_url in enumerate(urls):
            print('已经爬取了' + str(offset + m) + '篇文章')
            littleStories.append(_fetch_story(story_url, headers))
            time.sleep(1)
        print("爬取链接完成")

        # Pair titles with contents positionally; building a dict here would
        # silently drop stories that share a title.
        for k, (storyName, storyContent) in enumerate(zip(storiesNames, littleStories)):
            if storyContent is None:
                print('故事内容获取失败,已跳过: ' + urls[k])
                continue
            # One TXT file per story.
            textName = out_dir + '/' + 'Day' + str(offset + k) + '.txt'
            with open(textName, 'a', encoding='utf-8') as f:
                f.write(storyName or '')
                f.write('\n')
                f.write(storyContent)
            print("正在写入Day" + str(offset + k + 1) + "故事")
            time.sleep(1)

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# 初级程序员
# by: HAAYHR 发表于:2021-12-17 16:54:37 顶(0) | 踩(0) 回复
# ky
# 回复评论