[python]代码库
def getDailiIPs(url='http://www.xicidaili.com/nn/', timeout=10):
    """Fetch one proxy-list page and return its proxies as 'ip:port' strings.

    The page is downloaded with a browser-like User-Agent header, parsed
    with BeautifulSoup, and every ``<td>`` cell is scanned for an IPv4
    address or a port number.

    Args:
        url: Page to scrape. The default is the first page of the
            xicidaili listing (described by the original author as the
            domestic high-anonymity proxy list).
        timeout: Socket timeout in seconds passed to ``urlopen`` so a dead
            server cannot block the caller forever.

    Returns:
        A list of ``'ip:port'`` strings in page order. The list is empty
        when the page contains no recognisable proxy cells.

    Network and HTTP errors raised by ``urlopen`` are not caught here and
    propagate to the caller.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 '
        'SE 2.X MetaSr 1.0',
    )
    # The context manager closes the HTTP response once it has been parsed.
    with urllib.request.urlopen(req, timeout=timeout) as html:
        bsObj = BeautifulSoup(html, 'html.parser')
    return _pair_ips_with_ports(str(td) for td in bsObj.findAll('td'))


def _pair_ips_with_ports(cells):
    """Pair each IP-address cell with the next port cell that follows it.

    Args:
        cells: Iterable of ``<td>...</td>`` markup strings in document order.

    Returns:
        A list of ``'ip:port'`` strings.
    """
    # Compiled once, outside the loop.
    ip_re = re.compile(
        r'(([01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5])\.){3}'
        r'([01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5])'
    )
    # A port cell holds nothing but 1-5 digits. Requiring at least one digit
    # keeps empty <td></td> cells from being mistaken for a port.
    port_re = re.compile(r'<td>\s*(\d{1,5})\s*</td>')

    proxies = []
    pending_ip = None  # IP seen most recently that has no port yet
    for cell in cells:
        ip_match = ip_re.search(cell)
        if ip_match is not None:
            pending_ip = ip_match.group()
            continue
        port_match = port_re.search(cell)
        # Pairing sequentially (instead of looking the IP up by value) keeps
        # duplicate IPs attached to their own ports and never indexes past
        # the end of a shorter list.
        if port_match is not None and pending_ip is not None:
            proxies.append(pending_ip + ':' + port_match.group(1))
            pending_ip = None
    return proxies
by: 发表于:2017-09-19 09:41:35 顶(0) | 踩(0) 回复
??
回复评论