
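"""
Simple breadth-first web crawler.

Starting from start_url, it fetches each page, strips the HTML down to plain
text, appends that text to a per-depth CSV file (data_<level>.csv), logs every
absolute link it discovers to data_url.csv, and follows those links up to the
configured depth.
"""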
import time
import requests
from bs4 import BeautifulSoup
import re
import csv
def get_links(url):
    # Send a GET request and fetch the page content
    response = requests.get(url)
    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract all absolute links and log each one as it is found
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith('http'):
            links.append(href)
            save_data(href, "", "data_url.csv")
    return links
def scrape_data(url):
    # Send a GET request and fetch the page content
    response = requests.get(url)
    # Target encoding; GBK is assumed here, switch to response.encoding
    # if the site serves a different charset
    # encoding = response.encoding
    encoding = 'gbk'
    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Strip the tags and keep only the text content
    text = soup.get_text()
    # Collapse extra whitespace and line breaks
    text = re.sub(r'\s+', ' ', text).strip()
    # Round-trip through the target encoding, dropping characters it cannot represent
    decoded_text = text.encode(encoding, errors='ignore').decode(encoding)
    return decoded_text
def save_data(url, data, filename):
    # Open with an explicit encoding so non-ASCII text is written reliably
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([url, data])  # write one data row
    print(f"Saved to: {filename}")
def crawl(start_url, depth=2):
    visited = set()           # links that have already been visited
    queue = [(start_url, 0)]  # breadth-first queue of (url, level) pairs
    while queue:
        url, level = queue.pop(0)  # take the next link from the queue
        if level > depth:          # stop once the requested depth is exceeded
            break
        try:
            if url not in visited:
                visited.add(url)
                print(f"Crawling: {url}")
                # Scrape the page text
                data = scrape_data(url)
                # Save the scraped text
                save_data(url, data, f"data_{level}.csv")
                # Collect the links on the page
                links = get_links(url)
                # Queue the new links one level deeper
                queue.extend([(link, level + 1) for link in links])
                time.sleep(1)  # be polite: pause between requests
        except Exception as e:
            print(f"Exception: {e}; skipping link: {url}")
            continue
if __name__ == '__main__':
    start_url = 'https://www.ferrari.com/zh-CN/news'
    crawl(start_url, depth=2)



