from urllib.request import urlopen |
from bs4 import BeautifulSoup |
import re |
import sqlite3 |
import os |
def get_one_page(index):
    """Download one page of the Maoyan Top-100 board and return its HTML text.

    :param index: zero-based page number; the site paginates by 10 movies,
        so the request uses ``offset = index * 10``.
    :return: decoded HTML of the page as ``str``.
    """
    url = 'https://maoyan.com/board/4?offset=%d' % (index * 10)
    # Alternative formatting: 'https://maoyan.com/board/4?offset={}'.format(index * 10)
    # Close the HTTP response deterministically instead of leaking it to GC.
    with urlopen(url) as response:
        return response.read().decode()
# Path of the SQLite database file shared by create_table/save/find_by_title.
db_file = 'maoyan.db'
def create_table():
    """Create the ``movie`` table in the database at ``db_file``.

    Raises ``sqlite3.OperationalError`` if the table already exists
    (callers guard with ``os.path.exists(db_file)``).
    """
    conn = sqlite3.connect(db_file)   # 1. open the connection
    try:
        cursor = conn.cursor()        # 2. get an execution cursor
        cursor.execute('''
            create table movie(
                id integer primary key autoincrement ,
                title text,
                star text,
                reltime text,
                country text,
                score float
            )
        ''')                          # 3. run the DDL
        conn.commit()                 # 4. statements that modify the DB must be committed
    finally:
        conn.close()                  # 5. always close, even if execute() raised
def save(movie):
    """Insert one movie dict into the ``movie`` table.

    :param movie: dict with keys ``title``, ``star``, ``time``, ``country``,
        ``score`` (missing keys are stored as NULL via ``dict.get``).
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        # Parameterized INSERT — never build SQL by string concatenation.
        # BUGFIX: the original looked up keys 'title ', ' star ' and ' time'
        # (with stray spaces), so those columns were always inserted as NULL.
        cursor.execute('''
            insert into movie
            (title, star, reltime, country, score)
            values
            (?, ?, ?, ?, ?)
        ''', (movie.get('title'), movie.get('star'), movie.get('time'),
              movie.get('country'), movie.get('score')))
        conn.commit()   # writes must be committed
    finally:
        conn.close()    # release the connection even on error
# Query the database for movies whose title contains a keyword.
def find_by_title(key):
    """Return a list of movie dicts whose ``title`` contains *key*.

    :param key: substring to search for (SQL LIKE, case-insensitive for ASCII).
    :return: list of dicts with keys id/title/star/time/country/score.
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        # BUGFIX: the original pattern was ' % ' + key + ' % ', whose literal
        # spaces meant only titles containing " key " (space-delimited) matched.
        result = cursor.execute('''
            select * from movie
            where title like ?
        ''', ('%' + key + '%',))
        # SELECT needs no commit.
        movies = []
        for row in result:
            movies.append({
                'id': row[0],
                'title': row[1],
                'star': row[2],
                'time': row[3],
                'country': row[4],
                'score': row[5],
            })
        return movies
    finally:
        conn.close()
if __name__ == '__main__':   # BUGFIX: '= =' was a syntax error
    # Only create the table the first time; skip if the DB file exists.
    if not os.path.exists(db_file):
        create_table()
    # Example of saving a record:
    # movie = {'title': '霸王别姬', 'star': '张国荣,张丰毅,巩俐', 'time': '1993-01-01', 'score': 9.6}
    # save(movie)
    print(find_by_title('王'))

# NOTE: a maoyan.db file will appear in the project directory. To inspect it:
# 1) In PyCharm, click "Database" on the right-hand side
# 2) Click "+", choose Data Source, then sqlite
# 3) If "Download missing driver files" appears, click Download
# 4) Under "file", select the database file to open
# 5) Expand maoyan.db to browse it
#
# Next section: parse the board listing and store it in the database.
# html = get_one_page(0)
def get_movie_info(html):
    """Parse one Maoyan board page and return a list of movie dicts.

    :param html: HTML text of one board page (as returned by get_one_page).
    :return: list of dicts with keys title/star/time/score and, when the
        release-time field carries a region in full-width parentheses, country.
    """
    soup = BeautifulSoup(html, 'html.parser')
    ls_dd = soup.select('.board-wrapper dd')   # one <dd> per movie
    ls = []                                    # collected movie dicts
    for dd in ls_dd:
        movie = {}
        # Title comes from the poster link's title attribute.
        a = dd.find('a', attrs={"class": 'image-link'})
        movie['title'] = a.get('title')
        # Cast line looks like "主演:A,B,C\n"; strip the trailing newline,
        # then capture everything between the full-width colon and the first
        # whitespace. Raw strings fix the invalid '\s' escape of the original.
        p = dd.find('p', attrs={'class': 'star'})
        star = p.get_text().replace('\n', '')
        res = re.match(r'^.*?:(.*?)\s', star)
        movie['star'] = res.group(1)
        # Release time looks like "上映时间:1993-01-01(中国香港)".
        p = dd.find('p', attrs={'class': 'releasetime'})
        release_time = p.get_text()
        res = re.match(r'^.*?:([0-9-]+).*', release_time)
        movie['time'] = res.group(1)
        # Region is optional and sits inside full-width parentheses.
        res = re.match(r'^.*?\((.*)\)', release_time)
        if res:
            movie['country'] = res.group(1)
        # Score is split into integer part ("9.") and fraction part ("6").
        p = dd.find('p', attrs={'class': 'score'})
        i_int = p.find('i', attrs={'class': 'integer'})
        i_fra = p.find('i', attrs={'class': 'fraction'})
        movie['score'] = float(i_int.get_text() + i_fra.get_text())
        ls.append(movie)
    return ls
if __name__ == '__main__':   # BUGFIX: '= =' was a syntax error
    # Crawl all 10 board pages (100 movies) into one list.
    movie_list = []
    for index in range(0, 10):
        html = get_one_page(index)
        movie_list += get_movie_info(html)   # BUGFIX: '+ =' was a syntax error
    print(movie_list)
    print(len(movie_list))
    # Persist every movie to the database.
    for movie in movie_list:
        save(movie)
    # Interactive title search.
    key = input('请输入一个关键词')
    ls = find_by_title(key)
    for movie in ls:
        print(movie)