import csv
import os
import urllib.request as request

import bs4
import easygui
def is_connect():
    """Return True if the machine currently has internet access, else False.

    Probes a well-known site (baidu) with a short timeout so the check
    cannot hang indefinitely when the network is down.  Uses the stdlib
    ``urllib`` (already imported by this file) instead of pulling in a
    third-party ``requests`` dependency.
    """
    import urllib.request
    import urllib.error
    try:
        # Close the response explicitly (via the context manager) so the
        # probe does not leak a socket; the original never closed it.
        with urllib.request.urlopen("https://www.baidu.com", timeout=5):
            return True
    except (urllib.error.URLError, OSError):
        # URLError covers DNS / connection failures; OSError covers
        # lower-level socket errors such as timeouts.  The original used a
        # bare ``except:`` which also swallowed KeyboardInterrupt.
        return False
if is_connect():
    here = os.getcwd()
    # Ask for a save directory until we get one we can actually write to.
    while True:
        link = easygui.enterbox("请输入新闻文件保存路径", "新闻爬虫", here)
        if not link:
            # Dialog was cancelled — re-prompt instead of crashing on a
            # None path (the original's bare except hid this case).
            easygui.msgbox("路径错误或文件已打开")
            continue
        csv_path = os.path.join(link, "新闻数据.csv")
        try:
            # Writability probe.  Close the handle immediately — the
            # original leaked this descriptor, which is exactly what later
            # triggers the "file already open" failure it warns about.
            open(csv_path, "w").close()
            break
        except OSError:
            easygui.msgbox("路径错误或文件已打开")
    url = "http://news.sohu.com/"
    # Desktop User-Agent so the site serves the regular (non-mobile) page.
    req = request.Request(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"})
    response = request.urlopen(req).read().decode("utf-8")
    soup = bs4.BeautifulSoup(response, "html.parser")
    news = soup.findAll("a")    # candidate headline links
    news2 = soup.findAll("b")   # bold section titles
    passed_news = []
    # newline="" is what the csv module requires (" " is not a legal value),
    # and "utf-8" keeps the Chinese headlines intact on disk.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for title in news2:
            print(title.string)
            writer.writerow([title.string])
        for item in news:
            # Strip whitespace, then keep only real headlines: drop empty
            # anchors (whose .string prints as "None") and anything of
            # 6 characters or fewer.
            text = str(item.string).replace(" ", "").replace("\n", "")
            if "None" not in text and len(text) > 6:
                passed_news.append(text)
        # Trim boilerplate links: first entry and the last four are
        # navigation, not news.
        for headline in passed_news[:-4][1:]:
            print(headline)
            writer.writerow([headline])
    # The with-block closes the file; the original's f.close() was redundant.
    input("已爬取数据(按Enter键退出)")
else:
    easygui.msgbox("请连接网络")
    input("按Enter键退出")
初级程序员
by: 云代码会员 发表于:2021-01-26 11:21:09 顶(0) | 踩(0) 回复
....
回复评论