
from urllib.request import urlopen |
from bs4 import BeautifulSoup |
import re |
import sqlite3 |
import os |
def get_one_page(index):
    """Download one page of the Maoyan Top-100 movie board and return its HTML.

    Each page holds 10 entries; page ``index`` (zero-based) starts at
    offset ``index * 10``.  Returns the response body decoded with the
    default codec (UTF-8).
    """
    # String formatting with a %d placeholder; the equivalent str.format form:
    # url = 'https://maoyan.com/board/4?offset={}'.format(index * 10)
    url = 'https://maoyan.com/board/4?offset=%d' % (index * 10)
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(url) as response:
        return response.read().decode()
# Path of the SQLite database file shared by all DB helpers below.
db_file = 'maoyan.db'
def create_table(db_path='maoyan.db'):
    """Create the ``movie`` table in the SQLite database at *db_path*.

    The default path matches the module-wide ``db_file`` value, so existing
    ``create_table()`` callers are unaffected.  ``IF NOT EXISTS`` makes the
    call idempotent instead of raising on a second run.
    """
    conn = sqlite3.connect(db_path)   # 1. open the connection
    try:
        cursor = conn.cursor()        # 2. create an execution object
        cursor.execute('''
            create table if not exists movie(
                id integer primary key autoincrement,
                title text,
                star text,
                reltime text,
                country text,
                score float
            )
        ''')                          # 3. run the DDL
        conn.commit()                 # 4. statements that modify the DB must be committed
    finally:
        conn.close()                  # 5. always release the connection, even on error
def save(movie, db_path='maoyan.db'):
    """Insert one movie dict into the ``movie`` table at *db_path*.

    Expected keys: 'title', 'star', 'time' (release date, stored in the
    ``reltime`` column), 'country', 'score'.  Missing keys are stored as
    NULL via ``dict.get``.
    """
    # connect
    conn = sqlite3.connect(db_path)
    try:
        # parameterized SQL: values are bound with '?', never interpolated
        conn.execute('''
            insert into movie
            (title, star, reltime, country, score)
            values
            (?, ?, ?, ?, ?)
        ''', (movie.get('title'), movie.get('star'), movie.get('time'),
              movie.get('country'), movie.get('score')))
        # commit — inserts must be committed
        conn.commit()
    finally:
        # close the connection even if execute/commit raised
        conn.close()
# Query the database for movies whose title contains a keyword.
def find_by_title(key, db_path='maoyan.db'):
    """Return all movies whose title contains *key*, as a list of dicts.

    Each dict has keys id/title/star/time/country/score; the ``reltime``
    column is exposed under the 'time' key to match what ``save`` accepts.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Explicit column list: result order no longer depends on the schema.
        rows = conn.execute('''
            select id, title, star, reltime, country, score from movie
            where title like ?
        ''', ('%' + key + '%',))  # reads need no commit
        columns = ('id', 'title', 'star', 'time', 'country', 'score')
        return [dict(zip(columns, row)) for row in rows]
    finally:
        conn.close()  # close even if the query raised
if __name__ == '__main__':
    # Only create the table when the database file does not exist yet;
    # an existing file means the schema was already created.
    if not os.path.exists(db_file):
        create_table()
    # Example of saving one record:
    #movie = {'title': '霸王别姬', 'star': '张国荣,张丰毅,巩俐', 'time': '1993-01-01', 'score': 9.6}
    #save(movie)
    print(find_by_title('王'))
'''
在项目下会出现一个maoyan.db文件
1) PyCharm 右侧点击Database
2) 点击 + 弹出菜单中, 选择Data Source, 再选择sqlite
3) 如果有 Download missing driver files, 点击 Download
4) 选择 file 选择需要打开的数据库文件
5) 能不能展开, maoyan.db
'''
'''
解析贴吧列表,保存到数据库中'''
#html = get_one_page(0)
def get_movie_info(html):
    """Parse one Maoyan board page and return a list of movie dicts.

    Each dict may contain: 'title', 'star', 'time' (release date),
    'country' (only when present in the release string), and 'score'
    (float).  Star/time/country keys are omitted when the page text does
    not match the expected layout, instead of raising AttributeError.
    """
    # Compile once, outside the per-movie loop.  Raw strings avoid the
    # invalid escape sequences ('\s', '\(') of plain literals; [:：]
    # accepts both the ASCII and the full-width colon used on the site.
    star_re = re.compile(r'^.*?[:：](.*?)\s')
    time_re = re.compile(r'^.*?[:：]([0-9-]+).*')
    country_re = re.compile(r'^.*?\((.*)\)')

    soup = BeautifulSoup(html, 'html.parser')
    movies = []  # accumulates one dict per <dd> entry
    for dd in soup.select('.board-wrapper dd'):
        movie = {}

        # Title lives in the 'title' attribute of the poster link.
        link = dd.find('a', attrs={'class': 'image-link'})
        movie['title'] = link.get('title')

        # "主演：A,B,C\n" -> "A,B,C"; strip the embedded newlines first.
        star_text = dd.find('p', attrs={'class': 'star'}).get_text().replace('\n', '')
        match = star_re.match(star_text)
        if match:  # tolerate layout drift rather than crash mid-page
            movie['star'] = match.group(1)

        # "上映时间：1993-01-01(中国香港)" -> date, plus optional country.
        release_text = dd.find('p', attrs={'class': 'releasetime'}).get_text()
        match = time_re.match(release_text)
        if match:
            movie['time'] = match.group(1)
        match = country_re.match(release_text)
        if match:
            movie['country'] = match.group(1)

        # The score is split into integer and fraction parts, e.g. '9.' + '6'.
        score_p = dd.find('p', attrs={'class': 'score'})
        score = (score_p.find('i', attrs={'class': 'integer'}).get_text()
                 + score_p.find('i', attrs={'class': 'fraction'}).get_text())
        movie['score'] = float(score)

        movies.append(movie)
    return movies
if __name__ == '__main__':
    # Crawl all 10 pages of the board (100 movies total).
    movie_list = []
    for page in range(10):
        movie_list += get_movie_info(get_one_page(page))
    print(movie_list)
    # Persist every record to the database.
    print(len(movie_list))
    for movie in movie_list:
        save(movie)
    # Interactive lookup by title keyword.
    key = input('请输入一个关键词')
    for movie in find_by_title(key):
        print(movie)



