import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Matches href values that start with "/wiki/". Compiled once here rather
# than on every page fetch.
_WIKI_LINK = re.compile(r"^(/wiki/)")

# Every "/wiki/..." href that has already been printed. Shared at module
# level so repeated calls to getLinks() never revisit a page.
pages = set()


def _fetch_wiki_links(pageUrl):
    """Fetch one page and return its <a> tags whose href starts with /wiki/.

    Args:
        pageUrl: Path appended to "http://en.wikipedia.org" (may be "").

    Returns:
        The result of BeautifulSoup's find_all for matching anchor tags.

    Raises:
        Whatever urlopen raises for the request (not caught here).
    """
    # Parse inside the `with` so the HTTP response is closed as soon as the
    # document has been read, instead of leaking one open response per page.
    with urlopen("http://en.wikipedia.org" + pageUrl) as html:
        bsObj = BeautifulSoup(html, 'html.parser')
    return bsObj.find_all('a', href=_WIKI_LINK)


def getLinks(pageUrl):
    """Crawl depth-first from pageUrl, printing each new /wiki/ link once.

    Each newly seen href is printed, added to the module-level ``pages`` set,
    and then crawled itself before the remaining links of the current page
    are examined (the same pre-order as a recursive walk).

    The traversal uses an explicit stack of link iterators instead of
    recursion, so crawl depth is not capped by the interpreter's recursion
    limit.

    Args:
        pageUrl: Path appended to "http://en.wikipedia.org" for the first
            request (e.g. "" or "/wiki/Some_Page").

    Raises:
        Whatever urlopen raises for a failed request; errors are not
        swallowed, so a failing fetch stops the crawl.
    """
    # Each stack entry is an iterator over the links of one page on the
    # current path; the top entry is the page being scanned right now.
    pending = [iter(_fetch_wiki_links(pageUrl))]
    while pending:
        for link in pending[-1]:
            newPage = link.attrs.get('href')
            if newPage is None or newPage in pages:
                continue
            # Found a page we have not seen before.
            print(newPage)
            pages.add(newPage)  # remember it so it is never crawled twice
            # Descend into the new page; the iterator left on the stack
            # resumes from the next link once the descent is finished.
            pending.append(iter(_fetch_wiki_links(newPage)))
            break
        else:
            # Current page has no links left to examine: go back up.
            pending.pop()


getLinks("")