用户注册



邮箱:

密码:

用户登录


邮箱:

密码:
记住登录一个月 | 忘记密码?

发表随想


还能输入:200字
云代码 - python代码库

搜索贴吧内容,摘取其标题、发帖人、发帖时间、评论数,并保存到数据库

2019-02-21 作者:芙蓉妹妹 | 举报

[python]代码库

'''
针对贴吧前5页(可改)实现功能:
1、保存所查询的网页内容到文件
2、摘取每个帖子的属性信息(标题,发帖人,发帖时间,评论数),并保存到数据库中
3、根据标题从数据库中搜索帖子
'''
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import sqlite3
import os

# Ask the user which keyword to search for.
# (For non-interactive debugging, hard-code it instead, e.g. key = '芙蓉'.)
key = input('请输入一个查询关键字')

# Query-string parameters: "kw" carries the keyword; "ie" is presumably the
# input encoding expected by the site -- TODO confirm.
args = dict(kw=key, ie='utf-8')

# Base listing URL; the "&pn=" paging offset is appended per request.
url1 = 'http://tieba.baidu.com/f?{}'.format(urlencode(args))

def get_one_page(index):
    """Download one listing page and return its body as text.

    Args:
        index: Zero-based page number. The page is addressed by appending
            ``&pn=<index * 50>`` to the module-level ``url1``.

    Returns:
        The response body decoded with the default codec of
        ``bytes.decode()`` (UTF-8).

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    url = url1 + '&pn={}'.format(index * 50)
    # The context manager closes the HTTP response (and its socket) even when
    # reading or decoding raises; previously the response was never closed.
    with urlopen(url) as response:
        return response.read().decode()

def save_one_page(index, html):
    """Write one downloaded page to ``tieba/tieba_<index + 1>.html``.

    Args:
        index: Zero-based page number; the file name uses ``index + 1``.
        html: Page markup to write, encoded as UTF-8.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    directory = 'tieba'
    # Create the output directory on first use; writing into a missing
    # directory raised FileNotFoundError before.
    os.makedirs(directory, exist_ok=True)
    # os.path.join picks the platform separator. The former hard-coded
    # backslash only denotes a directory on Windows; elsewhere it became part
    # of the file name.
    filename = os.path.join(directory, 'tieba_{}.html'.format(index + 1))
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html)

# Path of the SQLite database file (relative, so it resolves against the
# current working directory).
db_file = 'tieba.db'
def create_table(db_path=None):
    """Create the ``tieba`` table that holds one row per scraped thread.

    Columns: ``id`` (autoincrementing integer primary key), ``title``,
    ``author`` and ``time`` (text), and ``num`` (int).

    Args:
        db_path: SQLite database file to use. Defaults to the module-level
            ``db_file``.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.
    """
    if db_path is None:
        db_path = db_file
    conn = sqlite3.connect(db_path)
    try:
        # "if not exists" keeps the call idempotent instead of failing when
        # the table is already present.
        conn.execute('''
            create table if not exists tieba(
                id integer primary key autoincrement,
                title text,
                author text,
                time text,
                num int
            )
        ''')
        conn.commit()
    finally:
        # Always release the connection, even if the statement failed.
        conn.close()

def save(tieba, db_path=None):
    """Insert one scraped thread into the ``tieba`` table.

    Args:
        tieba: Mapping read via ``.get`` for the keys ``title``, ``author``,
            ``time`` and ``num``; a missing key is inserted as NULL.
        db_path: SQLite database file to use. Defaults to the module-level
            ``db_file``.

    Raises:
        sqlite3.Error: If the insert fails (for example, the table is
            missing).
    """
    if db_path is None:
        db_path = db_file
    values = tuple(
        tieba.get(field) for field in ('title', 'author', 'time', 'num')
    )
    conn = sqlite3.connect(db_path)
    try:
        # Placeholders keep the scraped text out of the SQL string.
        conn.execute(
            '''
            insert into tieba
            (title, author, time, num)
            values
            (?, ?, ?, ?)
            ''',
            values,
        )
        conn.commit()
    finally:
        # Always release the connection, even if the insert failed.
        conn.close()


def find_by_title(key, db_path=None):
    """Return the stored threads whose title contains ``key``.

    The match is a SQL ``like '%key%'``, so ``%`` and ``_`` inside ``key``
    act as wildcards.

    Args:
        key: Substring to look for in the ``title`` column.
        db_path: SQLite database file to use. Defaults to the module-level
            ``db_file``.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``author``, ``time``
        and ``num``, one per matching row. The author key was previously
        misspelled ``auther``; it now matches the ``author`` key used when
        rows are scraped and saved.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is
            missing).
    """
    if db_path is None:
        db_path = db_file
    columns = ('id', 'title', 'author', 'time', 'num')
    conn = sqlite3.connect(db_path)
    try:
        # Name the columns explicitly so the mapping below does not depend on
        # the table's physical column order. Reads need no commit.
        rows = conn.execute('''
            select id, title, author, time, num from tieba
            where title like ?
        ''', ('%' + key + '%',)).fetchall()
    finally:
        # Always release the connection, even if the query failed.
        conn.close()
    return [dict(zip(columns, row)) for row in rows]

def get_tieba_info(html):
    """Extract title, author, time and reply count of each thread on a page.

    Args:
        html: Markup of one listing page.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``time`` and
        ``num`` (all taken from the markup as text). ``#thread_list li``
        items without an ``a.j_th_tit`` link are skipped. A field whose
        element or attribute is absent is stored as ``None`` instead of
        raising ``AttributeError``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    items = soup.select('#thread_list li')
    # Progress output: number of candidate list items on this page.
    print(len(items))

    threads = []
    for item in items:
        link = item.find('a', attrs={'class': 'j_th_tit'})
        if link is None:
            # No title link in this list item; nothing to record.
            continue

        # find() returns None when the element is absent, so every lookup
        # below is guarded before its attribute or text is read.
        author_tag = item.find('span', attrs={'class': 'tb_icon_author'})
        author = author_tag.get('title') if author_tag is not None else None
        if author is not None:
            # Drop newlines and the label prefix from the tooltip text.
            author = author.replace('\n', '').replace('主题作者:', '')

        time_tag = item.find('span', attrs={'class': 'pull-right'})
        created = time_tag.get_text() if time_tag is not None else None

        num_tag = item.find('span', attrs={'class': 'threadlist_rep_num'})
        replies = num_tag.get_text() if num_tag is not None else None

        threads.append({
            'title': link.get('title'),
            'author': author,
            'time': created,
            'num': replies,
        })
    return threads


if __name__ == '__main__':
    # Create the table only when the database file does not exist yet.
    if not os.path.exists(db_file):
        create_table()

    # Fetch the first five listing pages: collect the thread info from each
    # and keep a copy of the raw HTML on disk.
    tieba_list = []
    for index in range(5):
        html = get_one_page(index)
        tieba_list.extend(get_tieba_info(html))
        save_one_page(index, html)

    # Store every collected thread in the database.
    for t in tieba_list:
        save(t)

    # Let the user search the stored threads by title.
    key = input('请输入一个关键词')
    for t in find_by_title(key):
        print(t)

[代码运行效果截图]


搜索贴吧内容,摘取其标题、发帖人、发帖时间、评论数,并保存到数据库


分享到:
更多

网友评论    (发表评论)

共2 条评论 1/1页

发表评论:

评论须知:

  • 1、评论每次加2分,每天上限为30;
  • 2、请文明用语,共同创建干净的技术交流环境;
  • 3、若被发现提交非法信息,评论将会被删除,并且给予扣分处理,严重者给予封号处理;
  • 4、请勿发布广告信息或其他无关评论,否则将会删除评论并扣分,严重者给予封号处理。