import json
import os
import multiprocessing
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
base_url = 'http://study.163.com/course/introduction/'
local_url = base_url + '100125090.htm'

res = urlopen(local_url)
soup = BeautifulSoup(res, 'html.parser')
video_urls = soup.find_all('a', class_='f-thide f-fl')
for item in video_urls:
    video_url = item['href']
    print(video_url)
def crawl(video_url):
    # Get video_info from the embedded JSON on the course page
    res = urlopen(video_url)
    soup = BeautifulSoup(res, 'html.parser')
    json_str = soup.find('script', type='application/json').text
    data = json.loads(json_str)
    # Get videos
    videos = data.get('result', {}).get('mpath', [])
    os.makedirs('videos', exist_ok=True)  # make sure the output folder exists
    for index, video in enumerate(videos):
        print('url:', video)
        # download
        urlretrieve(video, f'videos/{index}.mp4')
    # Get description
    description = data.get('result', {}).get('crDescription', '')
    print('description:', description)
    # Save description
    with open('description.txt', 'w', encoding='utf-8') as f:
        f.write(description)
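# crawl() can also be called directly in a single process, e.g. crawl(video_url)
# for one of the links printed above (assuming the href is an absolute URL).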
# Create four sub-processes (the __main__ guard is required when the
# multiprocessing start method is 'spawn', e.g. on Windows/macOS)
if __name__ == '__main__':
    p1 = multiprocessing.Process(target=crawl, args=(video_url,))
    p2 = multiprocessing.Process(target=crawl, args=(video_url,))
    p3 = multiprocessing.Process(target=crawl, args=(video_url,))
    p4 = multiprocessing.Process(target=crawl, args=(video_url,))
    # Start sub-processes
    p1.start()
    p2.start()
    p3.start()
    p4.start()
    # Join sub-processes
    p1.join()
    p2.join()
    p3.join()
    p4.join()
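# --- Optional sketch, not part of the original code --------------------------
# The four processes above all crawl the same `video_url` (the last link found
# in the loop), so they duplicate work and race on the same output files. A
# minimal alternative, assuming the collected hrefs are absolute URLs, is to
# spread the distinct links across a fixed-size worker pool. Flip USE_POOL to
# True to try it; note that crawl() still reuses the same output paths, as in
# the original.
USE_POOL = False

if __name__ == '__main__' and USE_POOL:
    links = [a['href'] for a in video_urls]          # all course links found above
    with multiprocessing.Pool(processes=4) as pool:  # 4 workers, like p1-p4
        pool.map(crawl, links)                       # one crawl() call per link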