from urllib.request import urlretrieve |
from urllib.request import urlopen |
from bs4 import BeautifulSoup |
import os |
# Root folder under which downloaded assets are mirrored.
downloadDirectory = "download"
# Site being scraped; used to resolve relative src values and to filter
# out resources hosted on other sites.
baseUrl = "http://www.pythonscraping.com/"
def getAbsoluteURL(baseUrl, source):
    """Resolve a src attribute value to an absolute URL on the target site.

    A leading "www." is stripped from absolute URLs so comparisons stay
    consistent (e.g. http://www.oreilly.com -> http://oreilly.com).
    Returns None when the resolved URL is not under baseUrl, i.e. the
    resource is hosted on another site.
    """
    if source.startswith("http://www."):
        url = "http://" + source[11:]  # drop the "www." prefix
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        # Relative path: join onto baseUrl without doubling the slash
        # (the original produced "...com//path" when baseUrl ended in "/").
        url = baseUrl.rstrip("/") + "/" + source
    # Compare against baseUrl normalized the same way ("www." and trailing
    # slash removed).  The original tested the raw baseUrl, so a baseUrl
    # containing "www." rejected every www-stripped in-site URL.
    normalizedBase = "http://" + baseUrl.replace("http://", "").replace("www.", "").rstrip("/")
    if normalizedBase not in url.replace("www.", ""):
        return None
    return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path under downloadDirectory.

    Creates any missing intermediate directories so the caller can write
    the file directly.  Returns the local path.
    """
    # Strip "www." from BOTH strings before removing the site prefix; the
    # original stripped it only from absoluteUrl, so a baseUrl containing
    # "www." never matched and the scheme leaked into the path.
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl.replace("www.", ""), "")
    # os.path.join avoids the missing/doubled-slash problems of "+".
    path = os.path.join(downloadDirectory, path.lstrip("/"))
    directory = os.path.dirname(path)
    if directory and not os.path.exists(directory):
        # makedirs, not mkdir: URL paths are usually nested (a/b/c.png).
        os.makedirs(directory, exist_ok=True)
    return path
# Fetch the home page, collect every tag that carries a src attribute,
# and mirror each in-site resource to a matching local path.
homePage = urlopen("http://www.pythonscraping.com/")
soup = BeautifulSoup(homePage, 'html.parser')
for tag in soup.findAll(src=True):
    fileUrl = getAbsoluteURL(baseUrl, tag["src"])
    if fileUrl is None:
        continue  # hosted elsewhere; skip it
    print(fileUrl)
    urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))