from urllib.request import urlopen |
from bs4 import BeautifulSoup |
import datetime |
import re |
import random |
# Seed the PRNG from the current time. Passing a datetime object to
# random.seed() is deprecated since Python 3.9 and raises TypeError on
# 3.11+ (only None/int/float/str/bytes/bytearray are accepted), so use
# the POSIX timestamp (a float) instead.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Fetch a Wikipedia page and return its internal article links.

    articleUrl: the path part of an article URL, e.g. '/wiki/Kevin_Bacon'.
    Returns a list of <a> Tag objects found inside div#bodyContent whose
    href starts with '/wiki/' and contains no colon — i.e. real article
    links, excluding namespaced pages such as File:/Category:/Help:.
    """
    # Close the HTTP response deterministically (the original leaked it).
    with urlopen('https://en.wikipedia.org' + articleUrl) as html:
        bsObj = BeautifulSoup(html, 'html.parser')
    # Find all href links inside the div whose id is bodyContent.
    hrefs = bsObj.find('div', {"id": "bodyContent"}).find_all(
        "a", href=re.compile(r"^(/wiki/)((?!:).)*$"))
    return hrefs
# Random walk across Wikipedia: start at the Kevin Bacon article and
# repeatedly follow one randomly chosen article link until a page with
# no qualifying links is reached.
links = getLinks('/wiki/Kevin_Bacon')
while links:
    # Pick one href at random from the current page's article links
    # (random.choice replaces the hand-rolled randint indexing).
    newArticle = random.choice(links).attrs['href']
    print(newArticle)
    # Fetch the chosen article and gather its links for the next step.
    links = getLinks(newArticle)