A Simple Crawler Written in Python
import urllib2
import sys, os
import time
from bs4 import BeautifulSoup
import urlparse
SIZE=1000000
storageDir = 'storage/'
rootPath = None
pushed = set()
stack = []
# BASIC AUTH: install a single opener that carries both the basic-auth
# handler and a browser-like User-Agent header.
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='Authentication required',
                          uri='http://example.com/',
                          user='example',
                          passwd='dotcom')
opener = urllib2.build_opener(auth_handler)
opener.addheaders = [('User-agent', 'Chrome')]
urllib2.install_opener(opener)
def trim_protocol(url):
    if url.startswith("http://"):
        return url[7:]
    if url.startswith("https://"):
        return url[8:]
    return url
def getFile(f):
    # Map the URL onto a path under storageDir and open it for writing.
    name = storageDir + trim_protocol(f.geturl())
    if name.endswith("/"):
        name = name + "__root__"
    index = name.rfind('/')  # creating directory if needed
    if index != -1:
        if not os.path.exists(name[:index]):
            os.makedirs(name[:index])
    return open(name, 'wb')  # binary mode so images etc. are written verbatim
def genUrl(urlhead, target):
    # Resolve a raw href/src value against the current page's URL head.
    sindex = target.rfind("#")
    if sindex != -1:
        target = target[:sindex]  # drop the fragment
    if not target.startswith("javascript"):
        if not (target.startswith("http://") or target.startswith("https://")):
            if target.startswith("/"):
                return urlroot(urlhead) + target[1:]
            elif target.startswith("../"):
                return genUrl(removeOne(urlhead), target[3:])
            elif target.startswith("./"):
                return urlhead + target[2:]
            else:
                return urlhead + target
        else:
            return target
    return None
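# A few sample resolutions, assuming a hypothetical page at
# http://example.com/a/ (so urlhead == 'http://example.com/a/'):
#   genUrl(urlhead, 'b.html')         -> 'http://example.com/a/b.html'
#   genUrl(urlhead, '/c/d.html')      -> 'http://example.com/c/d.html'
#   genUrl(urlhead, '../e.html')      -> 'http://example.com/e.html'
#   genUrl(urlhead, 'javascript:...') -> None (scripts are skipped)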
def removeOne(url):
    # Drop the last path segment (used to resolve "../" references).
    if url.endswith("/"):
        url = url[:-1]
    i = url.rindex("/")
    return url[:(i + 1)]
def links(soup):
    # Yield every link-like attribute we follow: anchors, images, frames.
    for atag in soup.find_all("a"):
        try:
            # print 'content = ', atag.contents
            yield atag['href']
        except KeyError:
            pass  # ignore anchors without href
    for imgtag in soup.find_all("img"):
        try:
            yield imgtag['src']
        except KeyError:
            pass  # ignore
    for ftag in soup.find_all("frame"):
        try:
            yield ftag['src']
        except KeyError:
            pass  # ignore
def process(urlhead, doc):
    # Parse the page and push any new, in-scope URLs onto the stack.
    soup = BeautifulSoup(doc, 'html.parser')
    for target0 in links(soup):
        target = genUrl(urlhead, target0)
        if target is None:
            continue
        if target not in pushed:
            if (not rootPath) or target.startswith(rootPath):
                print ' pushing ', target
                pushed.add(target)
                stack.append(target)
def urlhead(name):
    if name.endswith("/"):
        return name
    index = name.rfind('/')
    if index != -1:
        return name[:(index + 1)]
    return name + "/"
def urlroot(name):
    o = urlparse.urlparse(name)
    return o.scheme + "://" + o.netloc + "/"
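# For the same hypothetical URL:
#   urlhead('http://example.com/a/b.html') -> 'http://example.com/a/'
#   urlroot('http://example.com/a/b.html') -> 'http://example.com/'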
def main(start):
    print 'starting with ', start
    print 'saving to ', storageDir
    stack.append(start)
    pushed.add(start)
    while len(stack) > 0:
        time.sleep(1)  # be polite: one request per second
        item = stack.pop()
        try:
            f = urllib2.urlopen(item)
            print 'accessing ', item, ' , url is ', f.geturl()
            if f.info()['Content-Type'].startswith('text/html'):
                # HTML: save it and scan it for further links
                doc = f.read()
                storage = getFile(f)
                storage.write(doc)
                process(urlhead(f.geturl()), doc)
                storage.close()
            else:
                # anything else: dump the content in chunks
                storage = getFile(f)
                while True:
                    buf = f.read(SIZE)
                    if not buf:
                        break
                    storage.write(buf)
                storage.close()
            f.close()
        except urllib2.HTTPError as e:
            print 'failed to get ', item, " , ", e
        except AttributeError as e:
            print 'failed to handle ', item, " , ", e
        except urllib2.URLError as e:
            print 'failed to handle ', item, " , ", e
def usage():
    print "python crawl.py START_POINT [ROOT_PATH]"
    sys.exit()
if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()
    if len(sys.argv) >= 3:
        rootPath = sys.argv[2]
    main(sys.argv[1])
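To try it, pass the start URL and, optionally, a root path; only URLs under ROOT_PATH are followed, and the crawler fetches one page per second. With a placeholder site (example.com is just an illustration):

python crawl.py http://example.com/docs/ http://example.com/docs/

Fetched pages land under storage/: http://example.com/docs/index.html is written to storage/example.com/docs/index.html, and a directory URL such as http://example.com/docs/ is saved as storage/example.com/docs/__root__.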