[python]代码库
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import re
import random
# Seed the generator from the current wall-clock time so every run takes a
# different walk. random.seed() only accepts None/int/float/str/bytes/bytearray
# (a datetime object raises TypeError on Python 3.11+), so pass the POSIX
# timestamp as a float instead of the datetime itself.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Fetch an English Wikipedia page and return its article links.

    Args:
        articleUrl: Site-relative path of the page, e.g. '/wiki/Kevin_Bacon'.
            It is appended to 'https://en.wikipedia.org'.

    Returns:
        The <a> tags located inside the div with id "bodyContent" whose
        href starts with '/wiki/' and contains no colon. An empty list is
        returned when the page has no such div.
    """
    # Close the HTTP response as soon as parsing is done; the caller invokes
    # this once per hop, so an unclosed response would leak a socket each time.
    with urlopen('https://en.wikipedia.org' + articleUrl) as html:
        bsObj = BeautifulSoup(html, 'html.parser')

    body = bsObj.find('div', {"id": "bodyContent"})
    if body is None:
        # No content container on this page: report "no links" rather than
        # failing with AttributeError on None.
        return []

    # Find every link in the bodyContent div whose href matches the pattern.
    return body.find_all("a", href=re.compile(r"^(/wiki/)((?!:).)*$"))
# Random walk: start at the Kevin Bacon article and keep hopping until a
# page yields no matching links.
links = getLinks('/wiki/Kevin_Bacon')
while links:
    # Pick one of the collected links at random and read its href.
    newArticle = random.choice(links)['href']
    print(newArticle)
    # Collect the article links of the newly chosen page for the next hop.
    links = getLinks(newArticle)
[代码运行效果截图]