[python] Code Library
#! /usr/bin/env python
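# crawl.py -- a simple single-site web crawler (Python 2):
# Retriever downloads one page and extracts its links; Crawler keeps a
# queue of unvisited links within the starting domain and works through it.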
from sys import argv
from os import makedirs,unlink,sep
from os.path import dirname,exists,isdir,splitext
from string import replace,find,lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse,urljoin
from formatter import DumbWriter,AbstractFormatter
from cStringIO import StringIO
class Retriever(object):                  # download web page
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        parsedurl = urlparse(url, 'http', 0)      # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                  # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)              # local directory
        if sep != '/':                    # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):               # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                   # download web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):           # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Crawler(object):                    # manage entire crawling process
    count = 0                             # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':              # error situation, do not parse
            print retval, '...skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()      # get and process links
        for eachlink in links:
            if eachlink[:4] != 'http' and \
                    find(eachlink, '://') == -1:
                eachlink = urljoin(url, eachlink)
            print '* ', eachlink

            if find(lower(eachlink), 'mailto') != -1:
                print '...discarded, mailto link'
                continue

            if eachlink not in self.seen:
                if find(eachlink, self.dom) == -1:
                    print '...discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '...new, added to Q'
                    else:
                        print '...discarded, already in Q'
            else:
                print '...discarded, already processed'

    def go(self):                         # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)
def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
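Usage note: the listing above targets Python 2 (htmllib, formatter, urlparse and cStringIO were removed or reorganized in Python 3, and it uses print statements and raw_input). Assuming the code is saved as crawl.py, it can be started with a URL on the command line, for example:

python2 crawl.py http://www.example.com/

A minimal sketch of the same link-extraction step for Python 3, using html.parser in place of the removed htmllib/formatter modules; the LinkParser class and the example URL are illustrative, not part of the original program:

#!/usr/bin/env python3
from html.parser import HTMLParser
from urllib.parse import urljoin

class LinkParser(HTMLParser):             # collect href values from <a> tags
    def __init__(self):
        super().__init__()
        self.anchorlist = []              # same attribute name the old htmllib parser exposed

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

# parse a previously downloaded page and resolve relative links
parser = LinkParser()
parser.feed(open('index.html', errors='ignore').read())
links = [urljoin('http://www.example.com/', link) for link in parser.anchorlist]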