from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html, 'html.parser')
    try:
        print(bsObj.h1.get_text())  # print the page title
        print(bsObj.find(id="mw-content-text").findAll('p')[0])  # print the first paragraph of the body text
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])  # print the edit link
    except AttributeError:
        print("This page is missing some attributes! No need to worry, though!")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # found an internal /wiki/ link we have not crawled yet
                newPage = link.attrs['href']
                print("--------------\n" + newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks("")