import time
import requests
from bs4 import BeautifulSoup
import re
import csv
def get_links(url):
    # Send a GET request to fetch the page
    response = requests.get(url)
    # Parse the HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract all absolute links
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith('http'):
            links.append(href)
            save_data(href, "", "data_url.csv")
    return links
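# Optional sketch (assumption, not part of the original script): the filter
# above keeps only absolute "http..." links, so relative hrefs such as
# "/zh-CN/news/article" are dropped. urllib.parse.urljoin can resolve them
# against the page URL first. get_links_absolute is a hypothetical helper name.
from urllib.parse import urljoin

def get_links_absolute(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for a in soup.find_all('a', href=True):
        absolute = urljoin(url, a['href'])  # make relative hrefs absolute
        if absolute.startswith('http'):
            links.append(absolute)
    return links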
def scrape_data(url):
    # Send a GET request to fetch the page
    response = requests.get(url)
    # Target encoding for the page text
    # encoding = response.encoding
    encoding = 'gbk'
    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Strip the tags and keep only the text
    text = soup.get_text()
    # Collapse extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    # Drop characters that cannot be represented in the target encoding
    decoded_text = text.encode(encoding, errors='ignore').decode(encoding)
    return decoded_text
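# Alternative sketch (assumption, not the original logic): instead of
# hard-coding 'gbk', let requests guess the charset from the response body
# via apparent_encoding, which is more robust for UTF-8 pages.
# scrape_data_detect_encoding is a hypothetical helper name.
def scrape_data_detect_encoding(url):
    response = requests.get(url)
    response.encoding = response.apparent_encoding  # charset detection
    soup = BeautifulSoup(response.text, 'html.parser')
    # strip tags, then collapse whitespace
    return re.sub(r'\s+', ' ', soup.get_text()).strip()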
def save_data(url, data, filename):
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([url, data])  # write one row of data
    print(f"Saved to: {filename}")
def crawl(start_url, depth=2):
    visited = set()           # links that have already been visited
    queue = [(start_url, 0)]  # breadth-first queue of (url, depth) pairs
    while queue:
        url, level = queue.pop(0)  # take the next link from the queue
        if level > depth:          # stop once the target depth is exceeded
            break
        try:
            if url not in visited:
                visited.add(url)
                print(f"Crawling: {url}")
                # Scrape the page text
                data = scrape_data(url)
                # Save the data
                save_data(url, data, f"data_{level}.csv")
                # Collect the outgoing links
                links = get_links(url)
                # Queue them one level deeper
                queue.extend([(link, level + 1) for link in links])
                # time.sleep(1)
        except Exception as e:
            print(f"Exception: {str(e)}, skipping link: {url}")
            continue
start_url = 'https://www.ferrari.com/zh-CN/news'
crawl(start_url, depth=2)
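# Performance sketch (assumption, not part of the original script): the same
# breadth-first loop with collections.deque, whose popleft() is O(1) where
# list.pop(0) is O(n). crawl_deque is a hypothetical variant of crawl above.
from collections import deque

def crawl_deque(start_url, depth=2):
    visited = set()
    queue = deque([(start_url, 0)])
    while queue:
        url, level = queue.popleft()
        if level > depth:
            break
        if url in visited:
            continue
        visited.add(url)
        try:
            data = scrape_data(url)
            save_data(url, data, f"data_{level}.csv")
            queue.extend((link, level + 1) for link in get_links(url))
        except Exception as e:
            print(f"Exception: {str(e)}, skipping link: {url}")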