from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='12345', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("USE wikipedia")  # switch to the wikipedia database
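# The statements below assume the wikipedia database already holds the two
# tables they read and write. A minimal sketch of a matching schema (column
# names come from the queries below; the types and lengths are assumptions):
cur.execute("""CREATE TABLE IF NOT EXISTS pages (
                   id INT NOT NULL AUTO_INCREMENT,
                   url VARCHAR(255) NOT NULL,
                   PRIMARY KEY (id))""")
cur.execute("""CREATE TABLE IF NOT EXISTS links (
                   id INT NOT NULL AUTO_INCREMENT,
                   fromPageId INT NOT NULL,
                   toPageId INT NOT NULL,
                   PRIMARY KEY (id))""")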
def insertPageIfNotExists(url):  # pages table
    cur.execute("SELECT * FROM pages WHERE url = %s", (url,))  # look the page up by its partial URL (the part after the domain)
    if cur.rowcount == 0:  # cur.rowcount is the number of rows in the result set; 0 means the URL is new
        cur.execute("INSERT INTO pages (url) VALUES (%s)", (url,))  # store the URL
        conn.commit()
        return cur.lastrowid  # AUTO_INCREMENT primary key of the row just inserted
    else:  # the URL is already stored
        return cur.fetchone()[0]  # fetchone() returns the next result row as a sequence (None when exhausted); index 0 is the id column
def insertLink(fromPageId, toPageId):  # links table
    cur.execute("SELECT * FROM links WHERE fromPageId = %s AND toPageId = %s",
                (int(fromPageId), int(toPageId)))
    if cur.rowcount == 0:  # only record each edge once
        cur.execute("INSERT INTO links (fromPageId, toPageId) VALUES (%s, %s)",
                    (int(fromPageId), int(toPageId)))
        conn.commit()
pages = set()

def getLinks(pageUrl, recursionLevel):
    global pages
    if recursionLevel > 4:  # stop once the recursion is more than 4 levels deep
        return
    pageId = insertPageIfNotExists(pageUrl)  # id of this page in the pages table
    html = urlopen("http://en.wikipedia.org" + pageUrl)  # e.g. http://en.wikipedia.org/wiki/Kevin_Bacon
    bsObj = BeautifulSoup(html, 'html.parser')
    # every article link on the current page: starts with /wiki/ and contains
    # no colon (which filters out special pages such as File: or Category:)
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        # store the linked page in pages, get its id back, and record the
        # (fromPageId, toPageId) pair in links
        insertLink(pageId, insertPageIfNotExists(link.attrs['href']))
        if link.attrs['href'] not in pages:  # this href has not been seen yet
            # a new page: add it to the set and crawl its article links in turn
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage, recursionLevel + 1)  # recurse one level deeper
getLinks("/wiki/Kevin_Bacon", 0)
cur.close()
conn.close()
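# A quick way to inspect what the crawl stored, sketched against the schema
# assumed above: count all pages, then the outbound links of the seed article.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='12345', db='wikipedia', charset='utf8')
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM pages")
print("pages stored:", cur.fetchone()[0])
# insertPageIfNotExists keeps urls unique, so the subquery yields one id
cur.execute("SELECT COUNT(*) FROM links "
            "WHERE fromPageId = (SELECT id FROM pages WHERE url = %s)",
            ("/wiki/Kevin_Bacon",))
print("links out of /wiki/Kevin_Bacon:", cur.fetchone()[0])
cur.close()
conn.close()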