
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Link paths (href values such as "/wiki/...") that getLinks has already
# discovered; module-level so the record is shared by every getLinks call.
pages = set()
def getLinks(pageUrl):
    """Crawl Wikipedia depth-first from *pageUrl*, printing each new link.

    The page at ``"http://en.wikipedia.org" + pageUrl`` is fetched and every
    ``<a>`` tag whose ``href`` starts with ``/wiki/`` is examined in document
    order.  Each href not yet in the module-level ``pages`` set is printed,
    added to ``pages``, and then crawled itself before the remaining links
    of the current page are looked at.

    The traversal uses an explicit stack instead of recursion, so a long
    chain of pages cannot exceed the interpreter's recursion limit.

    Args:
        pageUrl: Path appended to the site root, e.g. ``"/wiki/Python"``;
            the empty string fetches the root page.

    Returns:
        None.  Results are reported through ``print`` and by mutating the
        global ``pages`` set.

    Raises:
        Any exception raised by ``urlopen`` or ``BeautifulSoup`` propagates
        to the caller unchanged; nothing is caught here.
    """
    # Compile once per crawl: matches hrefs that begin with "/wiki/".
    wiki_href = re.compile("^(/wiki/)")

    def fetch_links(path):
        # Parse inside the ``with`` so the HTTP response is closed as soon
        # as the document has been read, instead of leaking one connection
        # per visited page.
        with urlopen("http://en.wikipedia.org" + path) as html:
            bsObj = BeautifulSoup(html, 'html.parser')
        # Find every link whose target starts with /wiki/.
        return iter(bsObj.findAll('a', href=wiki_href))

    # Each stack entry is the iterator over one page's not-yet-examined
    # links.  Working on the top entry and pushing a new one whenever a new
    # page is found reproduces the visiting order of a recursive descent.
    pending = [fetch_links(pageUrl)]
    while pending:
        link = next(pending[-1], None)
        if link is None:
            # Every link on this page has been handled; resume its parent.
            pending.pop()
            continue
        newPage = link.attrs.get('href')
        if newPage is None or newPage in pages:
            continue
        # Encountered a new page: report it and remember it before
        # descending so that links back to it are skipped.
        print(newPage)
        pages.add(newPage)
        pending.append(fetch_links(newPage))
# Start the crawl at the site root: the empty path makes getLinks fetch
# "http://en.wikipedia.org" itself.
getLinks("")


