from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import datetime
import random
import json
import re

# random.seed() wants a number/str/bytes in Python 3, so seed with the
# current timestamp rather than the datetime object itself.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    # Fetch an article page and collect every internal link in its body:
    # <a> tags whose href starts with /wiki/ and contains no colon.
    html = urlopen("https://en.wikipedia.org" + articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    internalSites = bsObj.find("div", {"id": "bodyContent"}).findAll(
        "a", href=re.compile("^(/wiki/)((?!:).)*$"))
    return internalSites
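# The regex above keeps only article links: a colon in the path marks a
# namespace page such as Category: or File:. A quick sanity check with
# illustrative sample paths:
#
#     re.match("^(/wiki/)((?!:).)*$", "/wiki/Python")           # matches
#     re.match("^(/wiki/)((?!:).)*$", "/wiki/Category:Python")  # None (colon)
#     re.match("^(/wiki/)((?!:).)*$", "/w/index.php")           # None (not /wiki/)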
def getHistoryIPs(pageUrl):
    # An article's edit-history page has the form:
    # http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    pageUrl = pageUrl.replace("/wiki/", "")
    historyUrl = ("http://en.wikipedia.org/w/index.php?title="
                  + pageUrl + "&action=history")
    print("history url is: " + historyUrl)
    html = urlopen(historyUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    # Links with class "mw-userlink mw-anonuserlink" belong to anonymous
    # editors, who are identified by IP address instead of username.
    ipAddresses = bsObj.findAll("a", {"class": "mw-userlink mw-anonuserlink"})
    addressList = set()
    for ipAddress in ipAddresses:
        # The IP is the last path segment of the link's href.
        addressList.add(ipAddress.attrs["href"].split("/")[-1])
    return addressList
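# Anonymous-editor links point at Special:Contributions pages, so the
# split above pulls the IP out of the href. Illustrative example
# (203.0.113.7 is a documentation address, not scraped data):
#
#     "/wiki/Special:Contributions/203.0.113.7".split("/")[-1]
#     # -> '203.0.113.7'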
def getCountry(ipAddress):
    try:
        response = urlopen("http://freegeoip.net/json/"
                           + ipAddress).read().decode("utf-8")
    except HTTPError:
        return None
    responseJson = json.loads(response)
    return responseJson.get("country_code")
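# Note: the freegeoip.net service has since shut down, so getCountry()
# will fail as written. A sketch of a drop-in replacement, assuming
# ipinfo.io's JSON endpoint (the helper name is mine; verify the URL and
# field names against the service's current docs):
def getCountryViaIpinfo(ipAddress):
    try:
        response = urlopen("https://ipinfo.io/" + ipAddress
                           + "/json").read().decode("utf-8")
    except HTTPError:
        return None
    # ipinfo.io is assumed to return a two-letter code under "country".
    return json.loads(response).get("country")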
links = getLinks("/wiki/Python")
while len(links) > 0:
    for link in links:
        print("-----------")
        # All anonymous-editor IPs on this article's history page.
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is not None:
                print(historyIP + " is from " + country)
    # Pick a random internal link and restart the crawl from that page.
    newLink = links[random.randint(0, len(links) - 1)].attrs["href"]
    links = getLinks(newLink)
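# Usage note: len(links) only drops to 0 if a page has no qualifying
# internal links, so in practice this walk runs until interrupted
# (Ctrl-C) or an unhandled urlopen error stops it.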