[python] Code listing
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
import datetime
import pymysql
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='12345', db='mysql', charset='utf8')  # treat everything sent to the database as UTF-8
cur = conn.cursor()
cur.execute('USE scrapying')  # switch to the scraping database
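# The listing never shows the schema for "pages"; the sketch below is an
# assumption that matches the INSERT in store(), not part of the original
# source (it also assumes the scrapying database already exists):
cur.execute("""CREATE TABLE IF NOT EXISTS pages (
    id BIGINT NOT NULL AUTO_INCREMENT,
    title VARCHAR(200),
    content VARCHAR(10000),
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id))""")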
random.seed(datetime.datetime.now().timestamp())  # seed the random generator with the current time; on Python 3.9+ seed() wants an int/float/str, so pass a timestamp rather than a datetime object
def store(title, content):
    # Let pymysql quote and escape the values itself; wrapping %s in literal
    # quote characters would break the parameter substitution
    cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))
    cur.connection.commit()  # commit the transaction so the row is persisted
def getLinks(articleUrl):
    html = urlopen("https://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, 'html.parser')
    title = bsObj.find('h1').get_text()  # the page title is the first <h1>
    content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()  # first paragraph of the body-text div
    store(title, content)  # save the title and first paragraph to the database
    # Keep only internal article links: "." matches any character except a newline,
    # and (?!:) filters out namespaced pages such as "File:" or "Talk:"
    return bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:  # keep walking while the current page has /wiki/ links
        newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]  # pick one of the /wiki/ links at random
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()  # always release the cursor and connection, even after an error
    conn.close()
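To check what the crawl stored, a quick query works (a sketch, assuming the same connection settings and the pages table above):

from pymysql import connect

conn = connect(host='127.0.0.1', port=3306, user='root', passwd='12345', db='scrapying', charset='utf8')
cur = conn.cursor()
cur.execute("SELECT title FROM pages ORDER BY id DESC LIMIT 5")  # five most recently stored titles
for (title,) in cur.fetchall():
    print(title)
cur.close()
conn.close()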