[python] Codebase
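"""
Breadth-first web crawler: starting from start_url, it downloads each page,
strips the HTML tags, saves the plain text to data_<depth>.csv, logs every
outgoing http(s) link to data_url.csv, and follows those links up to the
given depth.
"""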
import time
import requests
from bs4 import BeautifulSoup
import re
import csv
def get_links(url):
    # Send a GET request and fetch the page
    response = requests.get(url)
    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract all absolute links and log each one to data_url.csv
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith('http'):
            links.append(href)
            save_data(href, "", "data_url.csv")
    return links
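# Optional helper, not part of the original script: a sketch of how the crawl
# could be kept on the starting site, e.g. by checking `same_domain(href, url)`
# before appending in get_links above. The name same_domain is hypothetical.
def same_domain(href, base_url):
    from urllib.parse import urlparse
    # Compare the network locations (host:port) of the two URLs
    return urlparse(href).netloc == urlparse(base_url).netloc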
def scrape_data(url):
    # Send a GET request and fetch the page
    response = requests.get(url)
    # requests falls back to ISO-8859-1 when the headers carry no charset,
    # which garbles Chinese pages; use the encoding sniffed from the body instead
    response.encoding = response.apparent_encoding
    # Parse the page
    soup = BeautifulSoup(response.text, 'html.parser')
    # Strip the tags and keep only the text content
    text = soup.get_text()
    # Collapse runs of whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Drop any characters that GBK cannot represent (e.g. emoji) so the text stays GBK-safe
    text = text.encode('gbk', errors='ignore').decode('gbk')
    return text
def save_data(url, data, filename):
    # Append one row per page; utf-8 keeps non-ASCII text intact on any platform
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([url, data])  # write one row: [url, text]
    print(f"Saved to: {filename}")
def crawl(start_url, depth=2):
    visited = set()           # links already crawled
    queue = [(start_url, 0)]  # FIFO queue of (url, level) for breadth-first search
    while queue:
        url, level = queue.pop(0)  # take the next link off the queue
        if level > depth:          # stop once the target depth is exceeded
            break
        try:
            if url not in visited:
                visited.add(url)
                print(f"Crawling: {url}")
                # Scrape the page text
                data = scrape_data(url)
                # Save it to the CSV for this depth level
                save_data(url, data, f"data_{level}.csv")
                # Collect the outgoing links
                links = get_links(url)
                # Queue them one level deeper
                queue.extend([(link, level + 1) for link in links])
                # Politeness delay between requests
                time.sleep(1)
        except Exception as e:
            print(f"Error: {e}, skipping link: {url}")
            continue
start_url = 'https://www.ferrari.com/zh-CN/news'
crawl(start_url, depth=2)
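# A minimal usage sketch, not part of the original: read back the depth-0 CSV
# written by save_data (each row is [url, text]) and print a short preview.
with open('data_0.csv', newline='', encoding='utf-8') as file:
    for url, text in csv.reader(file):
        print(url, text[:80])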