from os import mkdir
from os.path import basename, isdir, join
from re import S, findall, sub  # S flag: lets '.' in a regex match newlines
from urllib.parse import urljoin
from urllib.request import urlopen
# Crawl a chain of news pages: save each article's text and images into its
# own folder under `root`, then follow the "next article" link until none
# remains.
url = r'网站地址'  # starting article URL (placeholder -- fill in a real address)
root = 'XX新闻'  # top-level output directory collecting every article
if not isdir(root):
    mkdir(root)

while True:
    # Download the current page and decode it as text.
    with urlopen(url) as fp:
        content = fp.read().decode()

    # --- Title: first <h1> on the page, inner tags and spaces stripped ---
    pattern = r'<h1 .+?>(.+?)</h1>'
    titles = findall(pattern, content)
    if not titles:
        # Not an article page (or the layout changed); stop instead of
        # crashing with an IndexError as the original [0] lookup would.
        break
    title = sub(r'<.+?>| ', '', titles[0])
    # Per-article folder; join() is portable, unlike the original
    # hard-coded '\' separator which only worked on Windows.
    child = join(root, title)
    if not isdir(child):
        mkdir(child)
    print(title)

    # --- Body text: every MsoNormal paragraph, tags stripped ---
    pattern = r'<p class="MsoNormal".+?>(.+?)</p>'
    with open(join(child, f'{title}.txt'), 'w', encoding='utf-8') as fp:
        for item in findall(pattern, content, S):  # S: '.' spans newlines
            item = sub(r'<.+?>| ', '', item).strip()
            if item:
                fp.write(item + '\n')

    # --- Images: download each <img ... src="..."> into the folder ---
    pattern = r'<img width=.+?src="(.+?)"'
    for item in findall(pattern, content):
        item = urljoin(url, item)  # resolve relative image URLs
        with urlopen(item) as fp_web:
            with open(join(child, basename(item)), 'wb') as fp_local:
                fp_local.write(fp_web.read())

    # --- Follow the "next article" link; stop when there is none ---
    pattern = r'下一条:<a href="(.+?)"'
    next_url = findall(pattern, content)
    if not next_url:
        break
    url = urljoin(url, next_url[0])