# Dotted-quad IPv4 address. The three-digit alternatives come first and the
# match must not be followed by another digit; otherwise a final octet such
# as "200" would be cut short to "20".
_IP_RE = re.compile(
    r'(?<!\d)'
    r'(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}'
    r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)'
    r'(?!\d)'
)

# A table cell whose entire content is a 1-5 digit number (the port column).
# At least one digit is required so that an empty <td></td> is never taken
# for a port.
_PORT_RE = re.compile(r'<td>(\d{1,5})</td>')


def getDailiIPs(url='http://www.xicidaili.com/nn/'):
    """Fetch one proxy-list page and return its proxies as "ip:port" strings.

    The page is downloaded with a browser-like User-Agent header, parsed with
    BeautifulSoup, and every ``<td>`` cell is scanned in document order. A
    cell containing an IPv4 address is remembered, and the next cell that
    holds nothing but a port number is paired with it.

    Args:
        url: Address of the proxy-list page. Defaults to the first page of
            the site's domestic high-anonymity list.

    Returns:
        A list of "ip:port" strings in page order. An IP that is not followed
        by a port cell before the next IP appears is dropped.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0',
    )
    # The context manager closes the HTTP response once parsing has read it.
    with urllib.request.urlopen(req) as response:
        soup = BeautifulSoup(response, 'html.parser')

    proxies = []
    pending_ip = None  # IP seen most recently that still lacks a port
    for td in soup.find_all('td'):
        cell = str(td)

        ip_match = _IP_RE.search(cell)
        if ip_match is not None:
            pending_ip = ip_match.group()
            continue

        port_match = _PORT_RE.search(cell)
        if port_match is not None and pending_ip is not None:
            # Pair sequentially instead of looking the IP up by value, so
            # the same IP listed twice keeps each of its own ports.
            proxies.append(pending_ip + ':' + port_match.group(1))
            pending_ip = None
    return proxies
# Scraped page footer (not code): posted 2017-09-19 09:41:35 | upvotes (0) | downvotes (0) | reply
# Reply to comment