
Advanced Web Client: Web Crawler -- crawl.py

2013-02-03  Author: sam_linux

[python] code listing

#! /usr/bin/env python

from sys import argv
from os import makedirs,unlink,sep
from os.path import dirname,exists,isdir,splitext
from string import replace,find,lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse,urljoin
from formatter import DumbWriter,AbstractFormatter
from cStringIO import StringIO

class Retriever(object):    # download web page
    def __init__(self,url):
        self.url = url
        self.file = self.filename(url)

    def filename(self,url,deffile='index.html'):
        parsedurl = urlparse(url,'http',0)    # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':    # no file extension, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)    # local directory
        if sep != '/':    # os-indep. path separator
            ldir = replace(ldir,'/',sep)
        if not isdir(ldir):    # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):    # download web page
        try:
            retval = urlretrieve(self.url,self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):    # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist

class Crawler(object):    # manage entire crawling process

    count = 0    # static download page counter

    def __init__(self,url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self,url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':    # error situation, do not parse
            print retval, '...skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:',url
        print 'FILE:',retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachlink in links:
            if eachlink[:4] != 'http' and \
                    find(eachlink,'://') == -1:
                eachlink = urljoin(url,eachlink)
            print '* ',eachlink

            if find(lower(eachlink),'mailto') != -1:
                print '...discarded, mailto link'
                continue

            if eachlink not in self.seen:
                if find(eachlink,self.dom) == -1:
                    print '...discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '...new, added to Q'
                    else:
                        print '...discarded, already in Q'
            else:
                print '...discarded, already processed'

    def go(self):    # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt,EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
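
How to run it: the listing targets Python 2 (htmllib, formatter, urlretrieve and raw_input were all removed in Python 3). Below is a minimal usage sketch, assuming the code above is saved as crawl.py and run with a Python 2 interpreter; the seed URL is only a placeholder.

# Crawl a site from the command line (or omit the URL to be prompted for one):
#     python crawl.py http://www.example.com/
#
# The Crawler class can also be driven directly from another script:
from crawl import Crawler                    # assumes the listing is saved as crawl.py

robot = Crawler('http://www.example.com/')   # placeholder seed URL
robot.go()   # downloads each in-domain page into a local directory tree mirroring its URL path

Only links on the same host as the seed URL are queued for download; mailto links, off-domain links, and already-processed pages are reported and skipped.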

