''' |
针对贴吧前5页(可改)实现功能: |
1、保存所查询的网页内容到文件 |
2、摘取每个帖子的属性信息(标题,发帖人,发帖时间,评论数),并保存到数据库中 |
3.根据标题从数据库中搜索帖子 |
''' |
from urllib.request import urlopen |
from urllib.parse import urlencode |
from urllib.request import urlopen |
from bs4 import BeautifulSoup |
import re |
import sqlite3 |
import os |
key = input ( '请输入一个查询关键字' ) |
#key = '芙蓉' |
args = { |
'kw' : key, |
'ie' : 'utf-8' |
} |
url1 = 'http://tieba.baidu.com/f?' + urlencode(args) |
def get_one_page(index): |
url = url1 + '&pn={}' . format (index * 50 ) |
response = urlopen(url) |
return response.read().decode() |
def save_one_page(index, html): |
filename = 'tieba\\tieba_{}.html' . format (index + 1 ) |
with open (filename, 'w' , encoding = 'utf-8' ) as file : |
file .write(html) |
pass |
db_file = 'tieba.db' |
def create_table(): |
conn = sqlite3.connect(db_file) #1连接数据库 |
cursor = conn.cursor() #2创建执行对象 |
cursor.execute( ''' |
create table tieba( |
id integer primary key autoincrement , |
title text, |
author text, |
time text, |
num int |
) |
''' ) #3 |
conn.commit() #4.提交操作,对于可以修改数据库内容的语句必须提交 |
conn.close() #5.关闭连接 |
def save(tieba): |
#连接 |
conn = sqlite3.connect(db_file) |
#创建执行对象 |
cursor = conn.cursor() |
#执行SQL语句 |
cursor.execute( ''' |
insert into tieba |
(title,author,time,num) |
values |
(?, ?, ?, ?) |
''' ,(tieba.get('title '),tieba.get(' author '),tieba.get(' time'), |
tieba.get( 'num' ))) |
#提交 |
conn.commit() |
#关闭 |
conn.close() |
# 根据标题关键字查询数据库 |
def find_by_title(key): |
conn = sqlite3.connect(db_file) |
cursor = conn.cursor() |
result = cursor.execute( ''' |
select * from tieba |
where title like ? |
''' , (' % '+key+' % ',)) |
#查询不需要提交 |
ls = [] |
for row in result: |
movie = {} |
movie[ 'id' ] = row[ 0 ] |
movie[ 'title' ] = row[ 1 ] |
movie[ 'auther' ] = row[ 2 ] |
movie[ 'time' ] = row[ 3 ] |
movie[ 'num' ] = row[ 4 ] |
ls.append(movie) |
conn.close() |
return ls |
def get_tieba_info(html): |
soup = BeautifulSoup(html, 'html.parser' ) |
ls_con = soup.select( '#thread_list li' ) |
print ( len (ls_con)) |
ls = [] #定义一个空列表,用来存放贴吧的信息 |
for con in ls_con: |
tieba = {} |
a = con.find( 'a' , attrs = { "class" : 'j_th_tit' }) |
# print(a) |
if a = = None : |
continue |
else : |
title = a.get( 'title' ) |
# print(title) |
tieba[ 'title' ] = title |
p = con.find( 'span' , attrs = { 'class' : 'tb_icon_author' }) |
author = p.get( 'title' ) |
# print(author) |
author = author.replace( '\n' , '') # 去掉字符串结尾的\n |
author = author.replace( '主题作者:' , '') # 去掉字符串中的多余字符 |
# print(author) |
tieba[ 'author' ] = author |
p = con.find( 'span' , attrs = { 'class' : 'pull-right' }) |
# print(p) |
time = p.get_text() |
# print(time) |
tieba[ 'time' ] = time |
p = con.find( 'span' , attrs = { 'class' : 'threadlist_rep_num' }) |
# print(p) |
num = p.get_text() |
# print(num) |
tieba[ 'num' ] = num |
ls.append(tieba) |
return ls |
if __name__ = = '__main__' : |
if not os.path.exists(db_file): #若已经存在就不再创建新表 |
create_table() |
tieba_list = [] |
''' |
#仅保存第一页的代码 |
html = get_one_page(0) |
get_tieba_info(html) |
tieba_list += get_tieba_info(html) |
print(tieba_list) |
''' |
#保存到数据库,最后要测试 |
for index in range ( 0 , 5 ): |
html = get_one_page(index) |
tieba_list + = get_tieba_info(html) |
# 保存网页到文件 |
save_one_page(index, html) |
|
#把数据保存到表中 |
for t in tieba_list: |
save(t) |
key = input ( '请输入一个关键词' ) |
ls = find_by_title(key) |
for t in ls: |
print (t) |
中级程序员
by: mujianlian 发表于:2019-03-26 15:27:45 顶(1) | 踩(1) 回复
No module named 'bs4'
网友回复
回复芙蓉妹妹 : 要引入库的
顶(1) 踩(1) 2019-04-20 11:17:52
回复评论