## ********************************** Step 1: scrape all used-car brands on the platform **********************************
# Import third-party packages
import requests
from bs4 import BeautifulSoup
import time
# Set request headers that mimic a browser
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0'
}
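# Optional variant, a sketch that is not part of the original script: a
# requests.Session reuses the underlying connection and attaches these headers
# to every call, so the explicit headers= argument used below could be dropped.
session = requests.Session()
session.headers.update(headers)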
# Landing page of the platform's used-car listings (Changsha site)
url = 'https://changsha.taoche.com/all/'
# Request the page and parse the response
res = requests.get(url, headers=headers).text
soup = BeautifulSoup(res, 'html.parser')
# Extract every brand name and its link; the [41:] slice skips leading
# entries that are not brand links and depends on the current page layout,
# so it may need adjusting if the site changes
car_brands = soup.find('div', {'class': 'li_main clearfix'}).find_all('li')
car_brands = [j for i in car_brands for j in i]
car_brands = car_brands[41:]
brands = [i.text for i in car_brands]
urls = ['https://changsha.taoche.com' + i['href'] for i in car_brands]
print(urls)
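# Quick sanity check (an added sketch, not part of the original flow): brands
# and urls are built from the same tag list, so they should line up one-to-one;
# if this fails, the [41:] slice above is the first place to look.
assert brands and len(brands) == len(urls), 'brand/url lists are out of sync'
print(len(brands), 'brands found')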
## ********************************** Step 2: build the listing links to scrape **********************************
# Generate the full set of paged links that need scraping
target_urls = []
target_brands = []
for b, u in zip(brands, urls):
    # Request each brand's main page to find how many listing pages it has
    res = requests.get(u, headers=headers).text
    soup = BeautifulSoup(res, 'html.parser')
    # Read the page count: when the pagination bar is absent there is a single
    # page; otherwise the second-to-last <a> holds the last page number
    if len(soup.find_all('div', {'class': 'paging-box the-pages'})) == 0:
        pages = 1
    else:
        pages = int([page.text for page in soup.find_all('div', {'class': 'paging-box the-pages'})[0].find_all('a')][-2])
    print(pages)
    # Pause 3 seconds between brands to stay polite
    time.sleep(3)
    for i in range(1, pages + 1):
        target_brands.append(b)
        try:
            target_urls.append(u + '?page=' + str(i) + '#pagetag')
            # Presence check on the brand page parsed above; raises
            # AttributeError when it has no listing container at all
            soup.find('ul', {'class': 'gongge_ul'}).find_all('li')
        except AttributeError:
            print('Current page is blank')
print(target_urls)
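# The request-then-parse pattern above repeats for every URL in this script.
# A small helper like the sketch below (an assumption, not code from the
# original) adds a timeout and a few retries so one flaky response does not
# abort the whole crawl.
def get_soup(page_url, retries=3):
    """Fetch a URL with the shared headers and return parsed soup, or None."""
    for _ in range(retries):
        try:
            resp = requests.get(page_url, headers=headers, timeout=10)
            resp.raise_for_status()
            return BeautifulSoup(resp.text, 'html.parser')
        except requests.RequestException:
            time.sleep(3)  # brief back-off before retrying
    return None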
## ********************************** Step 3: collect the used-car listings **********************************
# Build empty lists to store the scraped fields
brand = []
title = []
boarding_time = []
km = []
discharge = []
sec_price = []
new_price = []
# Request each target link in turn
for b, u in zip(target_brands, target_urls):
    res = requests.get(u, headers=headers).text
    soup = BeautifulSoup(res, 'html.parser')
    # Number of cars on this page
    N = len([i.find_all('a')[0]['title'] for i in soup.find_all('div', {'class': 'gongge_main'})])
    print(N)
    try:
        # Brand: one entry per car on the page
        brand.extend([b] * N)
        # Car name
        title.extend([i.find_all('a')[0]['title'] for i in soup.find_all('div', {'class': 'gongge_main'})])
        # Registration date, mileage, emission standard, etc.
        info = [i.find_all('li') for i in soup.find_all('ul', {'class': 'gongge_ul'})]
        boarding_time.extend([i[0].text[4:] for i in info])
        km.extend([i[1].text[4:] for i in info])
        discharge.extend([i[3].text[4:] for i in info])
        # Second-hand asking price (drop the trailing unit character before converting)
        sec_price.extend([float(i.find_all('h2')[0].text[:-1]) for i in soup.find_all('div', {'class': 'gongge_main'})])
        # Original new-car guide price (kept as a string with its unit)
        new_price.extend([i.find_all('p')[0].text.split('\xa0')[0][5:].strip() for i in soup.find_all('div', {'class': 'gongge_main'})])
    except IndexError:
        pass
    # Pause 3 seconds between pages
    time.sleep(3)
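# Because the except-IndexError branch above skips a page silently, the seven
# field lists can end up with different lengths; this defensive check (added
# here as a sketch) catches any drift before the DataFrame is built in Step 4.
lengths = [len(x) for x in (brand, title, boarding_time, km, discharge, sec_price, new_price)]
assert len(set(lengths)) == 1, 'field lists out of sync: %s' % lengths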

## ********************************** Step 4: store the collected data **********************************
# Export the data with pandas
import pandas as pd
cars_info = pd.DataFrame([brand, title, boarding_time, km, discharge, sec_price, new_price]).T
cars_info = cars_info.rename(columns={0: 'Brand', 1: 'Name', 2: 'Boarding_time', 3: 'Km', 4: 'Discharge', 5: 'Sec_price', 6: 'New_price'})
cars_info.to_csv('second_cars_info.csv', index=False)
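# A short usage sketch (assumes the CSV written above): read the file back for
# a first look. Sec_price was stored as a float so it reads back numeric, while
# Km and New_price remain raw strings that may still carry unit text and would
# need further cleaning before analysis.
check = pd.read_csv('second_cars_info.csv')
print(check.head())
print(check['Sec_price'].describe())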