from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime
import pymysql
# Treat everything sent to the database as UTF-8
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='12345', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute('USE scrapying')
# Seed the random number generator with the current time
random.seed(datetime.datetime.now().timestamp())
def store(title, content):
    # pymysql quotes the parameters itself, so the %s placeholders must not be wrapped in quotes
    cur.execute('INSERT INTO pages (title, content) VALUES (%s, %s)', (title, content))
    cur.connection.commit()  # commit the transaction
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org' + articleUrl)
    bsObj = BeautifulSoup(html, 'html.parser')
    title = bsObj.find('h1').get_text()  # the article title text
    # first paragraph inside the div that holds the article body
    content = bsObj.find('div', {'id': 'mw-content-text'}).find('p').get_text()
    store(title, content)  # save the title and body text to the database
    # all in-article links that start with /wiki/ and contain no colon
    # (. matches any character except a newline)
    return bsObj.find('div', {'id': 'bodyContent'}).findAll(
        'a', href=re.compile('^(/wiki/)((?!:).)*$'))
links = getLinks('/wiki/Kevin_Bacon')
try:
    while len(links) > 0:  # as long as there are links starting with /wiki/
        # pick one of the /wiki/ links at random
        newArticle = links[random.randint(0, len(links) - 1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
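
For reference, the script assumes a database named scrapying (as in the USE statement above) already exists and contains a pages table with title and content columns. A minimal one-time setup sketch, reusing the same connection parameters; the exact column sizes and the id/created columns here are assumptions, not part of the original post:

import pymysql

# Setup sketch: create the database and the pages table the script expects.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='12345', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS scrapying CHARACTER SET utf8')
cur.execute('USE scrapying')
cur.execute('''
    CREATE TABLE IF NOT EXISTS pages (
        id BIGINT NOT NULL AUTO_INCREMENT,
        title VARCHAR(200),
        content VARCHAR(10000),
        created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY (id)
    )
''')
cur.close()
conn.close()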