import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def bfs_crawler(seed_url, max_depth):
    visited = set()          # URLs that have already been crawled
    queue = [(seed_url, 0)]  # FIFO queue of (URL, depth) pairs waiting to be crawled
    while queue:
        url, depth = queue.pop(0)  # dequeue the next URL and its depth
        if depth > max_depth:
            continue
        if url in visited:
            continue
        try:
            # Fetch the page; a timeout keeps the crawler from hanging on slow hosts.
            response = requests.get(url, timeout=10)
            print('crawl ' + url)
            html_text = response.text
            visited.add(url)  # mark this URL as visited
            soup = BeautifulSoup(html_text, 'html.parser')
            # print(soup.prettify())  # dump the parsed page for inspection
            # TODO: save the extracted page content to a database
            #       (see the sketch after this function for one option)
            links = soup.find_all('a')  # collect every link on the page
            for link in links:
                absolute_url = urljoin(url, link.get('href'))  # resolve relative URLs against the current page
                if absolute_url.startswith('https') and absolute_url not in visited:
                    queue.append((absolute_url, depth + 1))  # enqueue the new URL one level deeper
        except requests.exceptions.RequestException:
            continue
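
# One possible way to fill the "save to a database" TODO above. This is only a
# minimal sketch: SQLite via the standard-library sqlite3 module is an assumed
# choice, not something the original code specifies, and the table layout
# (URL plus raw HTML) is likewise illustrative.
import sqlite3


def save_page(db_path, url, html_text):
    # Store the raw HTML of a crawled page, keyed by its URL.
    conn = sqlite3.connect(db_path)
    conn.execute(
        'CREATE TABLE IF NOT EXISTS pages (url TEXT PRIMARY KEY, html TEXT)'
    )
    conn.execute(
        'INSERT OR REPLACE INTO pages (url, html) VALUES (?, ?)',
        (url, html_text),
    )
    conn.commit()
    conn.close()

# Inside bfs_crawler this could be called right after parsing, e.g.:
#     save_page('crawl.db', url, html_text)
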
seed_url = 'https://www.oreilly.com/search/?q=python&type=*&rows=10'  # seed URL to start crawling from
max_depth = 2  # maximum crawl depth
bfs_crawler(seed_url, max_depth)  # run the breadth-first crawler
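
# Note on the queue: list.pop(0) is O(n) because every remaining element shifts
# left on each dequeue, so the cost grows as the crawl frontier gets large.
# collections.deque keeps the same FIFO (breadth-first) order with O(1)
# popleft(). The swap touches only two lines of bfs_crawler; a minimal sketch:
#
#     from collections import deque
#     queue = deque([(seed_url, 0)])   # instead of a plain list
#     url, depth = queue.popleft()     # instead of queue.pop(0)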