[python]代码库
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Every /wiki/ href the crawler has already printed and followed. getLinks
# checks and grows this set so that no link is followed twice.
pages = set()
def _printPageSummary(bsObj):
    """Print the title, first body paragraph and edit link of a parsed page.

    If any of those elements is missing, a short notice is printed instead
    of raising, so the crawl keeps going.
    """
    try:
        print(bsObj.h1.get_text())  # page title
        print(bsObj.find(id="mw-content-text").find_all('p')[0])  # first paragraph of the body
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])  # edit link
    except (AttributeError, IndexError):
        # AttributeError: an expected element is absent (find() returned None).
        # IndexError: the content block exists but contains no <p> element.
        print("页面缺少一些属性!不过不用担心!")


def _fetchWikiLinks(pageUrl):
    """Download one page, print its summary and return its /wiki/ hrefs.

    Args:
        pageUrl: Path appended to "http://en.wikipedia.org".

    Returns:
        The href of every anchor whose href starts with "/wiki/", in
        document order (duplicates included).
    """
    # The parser consumes the response inside the constructor, so the
    # connection can be closed as soon as the soup has been built.
    with urlopen("http://en.wikipedia.org"+pageUrl) as html:
        bsObj = BeautifulSoup(html, 'html.parser')
    _printPageSummary(bsObj)
    return [link.attrs['href']
            for link in bsObj.find_all("a", href=re.compile("^(/wiki/)"))
            if 'href' in link.attrs]


def getLinks(pageUrl):
    """Crawl depth-first from pageUrl, following every unseen /wiki/ link.

    For each page visited, the title, first paragraph and edit link are
    printed. Each newly discovered href is printed after a separator line,
    recorded in the module-level ``pages`` set, and then visited before the
    remaining links of the current page are examined.

    The traversal uses an explicit stack instead of recursion, so a long
    chain of links cannot exceed the interpreter's recursion limit.

    Args:
        pageUrl: Path appended to "http://en.wikipedia.org"; an empty string
            requests the site root.

    Raises:
        urllib.error.URLError: If a page cannot be fetched (propagated from
            urlopen).
    """
    # One iterator per page currently being expanded; the top of the stack is
    # the page whose links are being examined right now.
    pending = [iter(_fetchWikiLinks(pageUrl))]
    while pending:
        for newPage in pending[-1]:
            if newPage in pages:
                continue
            print("--------------\n"+newPage)
            pages.add(newPage)
            # Descend into the new page first; the parent's iterator keeps
            # its position and resumes once the child is exhausted.
            pending.append(iter(_fetchWikiLinks(newPage)))
            break
        else:
            # Every link on this page has been handled: back up one level.
            pending.pop()
# Start the crawl at the site root: an empty path leaves just the base URL
# that getLinks prepends. This call runs (and performs network requests)
# whenever the module is executed or imported.
getLinks("")
[代码运行效果截图]