import re
import sys
from urllib.error import URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup
# Hrefs of every page discovered so far; getLinks reads and updates this set.
pages = set()
# Matches hrefs that start with /wiki/; compiled once instead of on every fetch.
_WIKI_HREF = re.compile(r"^(/wiki/)")


def _wiki_hrefs(pageUrl):
    """Return the ``/wiki/`` hrefs found on one page, in document order.

    Args:
        pageUrl: Path appended to ``http://en.wikipedia.org``.

    Returns:
        A list of href strings; empty when the page cannot be fetched.
    """
    try:
        # The context manager closes the HTTP response once parsing is done.
        with urlopen("http://en.wikipedia.org" + pageUrl) as html:
            bsObj = BeautifulSoup(html, 'html.parser')
    except URLError as err:
        # URLError also covers HTTPError. One unreachable page should not
        # abort the whole crawl, so report it and carry on.
        print("Skipping %s: %s" % (pageUrl, err), file=sys.stderr)
        return []
    # Find every anchor whose href starts with /wiki/.
    anchors = bsObj.find_all('a', href=_WIKI_HREF)
    return [a.attrs['href'] for a in anchors if 'href' in a.attrs]


def getLinks(pageUrl: str) -> None:
    """Crawl Wikipedia depth-first from *pageUrl*, printing each new page path.

    Every ``/wiki/`` href not yet present in the module-level ``pages`` set is
    printed, added to ``pages``, and then crawled in turn before the remaining
    links of the current page are examined.

    The crawl has no depth or page limit; it only stops when no unseen links
    remain.

    Args:
        pageUrl: Path appended to ``http://en.wikipedia.org``; ``""`` starts
            at the site root.
    """
    # Each stack entry is an iterator over one page's links and stands in for
    # one frame of a recursive crawl, so the crawl depth is not bounded by the
    # interpreter's recursion limit.
    pending = [iter(_wiki_hrefs(pageUrl))]
    while pending:
        for newPage in pending[-1]:
            if newPage in pages:
                continue
            # New page encountered.
            print(newPage)
            pages.add(newPage)  # remember it so it is never crawled twice
            # Descend into the new page before finishing the current one.
            pending.append(iter(_wiki_hrefs(newPage)))
            break
        else:
            # Current page has no unseen links left; go back to its parent.
            pending.pop()
# Start the crawl at the site root: an empty path leaves the base URL as is.
getLinks("")