[python] code listing
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def bfs_crawler(seed_url, max_depth):
    visited = set()  # URLs that have already been crawled
    queue = [(seed_url, 0)]  # frontier of (URL, depth) pairs waiting to be crawled
    while queue:
        url, depth = queue.pop(0)  # dequeue the next URL and its depth
        if depth > max_depth:
            continue
        if url in visited:
            continue
        try:
            response = requests.get(url)  # fetch the page over HTTP
            print('crawl ' + url)
            html_text = response.text
            visited.add(url)  # mark this URL as visited
            soup = BeautifulSoup(html_text, 'html.parser')
            # print(soup.prettify())  # dump the parsed page content
            # TODO: extract the page content and save it to a database
            links = soup.find_all('a')  # collect every link on the page
            for link in links:
                absolute_url = urljoin(url, link.get('href'))  # resolve relative URLs to absolute ones
                # only follow HTTPS links that have not been visited yet
                if absolute_url.startswith('https') and absolute_url not in visited:
                    queue.append((absolute_url, depth + 1))  # enqueue the new URL one level deeper
        except requests.exceptions.RequestException:
            continue


seed_url = 'https://www.oreilly.com/search/?q=python&type=*&rows=10'  # seed URL
max_depth = 2  # maximum crawl depth
bfs_crawler(seed_url, max_depth)  # run the breadth-first crawler
[Screenshot of the code's run output]
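Two small refinements are common in practice, though they are not part of the listing above: `list.pop(0)` is O(n), so a larger crawl usually keeps the frontier in a `collections.deque`, and `requests.get` is usually given a timeout so one hung server cannot stall the whole crawl. A minimal sketch of just those changes, reusing the names from the listing (the 10-second timeout is an assumed value):

[python] code listing
from collections import deque

import requests


def bfs_crawler(seed_url, max_depth):
    visited = set()
    queue = deque([(seed_url, 0)])  # deque gives O(1) pops from the left
    while queue:
        url, depth = queue.popleft()  # replaces the O(n) list.pop(0)
        if depth > max_depth or url in visited:
            continue
        try:
            # the 10-second timeout is an assumed value, not taken from the original listing
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            continue
        visited.add(url)
        print('crawl ' + url)
        # parsing the HTML and enqueueing new (URL, depth + 1) pairs proceeds
        # exactly as in the listing above

The crawl order and depth limit are unchanged; only the queue data structure and the request timeout differ.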