from urlparse import urlparse
from urlparse import urljoin
import urllib2
import copy
import zlib
import sys
import re
class OpenUrl(object):
    """Minimal HTTP fetcher that sends a browser-like User-Agent."""
    headers = {"User-Agent":
               "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201"}

    def open(self, url):
        try:
            request = urllib2.Request(url, headers=self.headers)
            obj_web = urllib2.urlopen(request)
            response = obj_web.info()
            data = obj_web.read()
            # Decompress gzip-encoded bodies; otherwise keep the raw bytes.
            if "gzip" in (response.get("Content-Encoding") or ""):
                print "Gzip-encoded response"
                buffer = zlib.decompress(data, 16 + zlib.MAX_WBITS)
            else:
                buffer = data
        except Exception as e:
            print "({0}) Error: {1}".format(url, e)
            return False
        else:
            return buffer
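# Quick standalone sanity check for OpenUrl (hypothetical URL; requires a
# network connection): print the first 200 bytes of the decoded body.
#
#   print OpenUrl().open("http://example.com")[:200]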
class Crawler(OpenUrl):
    def __init__(self, url, buscar):
        self.host = "http://" + urlparse(url).hostname
        self.headers["Referer"] = self.host
        self.revisar = set([url])          # URLs queued for the current level
        self.revisados = set()             # URLs already fetched
        self.permitidos = ("html", "php", "php5")  # extensions worth crawling into
        self.buscar = buscar               # extensions to collect
        self.externos = set()              # links pointing outside the host
        self.cola = set()                  # URLs discovered for the next level
        # Capture the target of every href/src attribute, case-insensitively.
        self.expresion = re.compile(
            r"(href|src)\s*?=\s*?[\"\'](.*?)[\"\']", re.I | re.S)
        self.encontrados = dict((x, set()) for x in buscar)
    def crawl(self, profundidad):
        if profundidad > 0:
            # Merge the links discovered on the previous pass into the queue.
            self.revisar = self.revisar | self.cola
            if self.revisar:
                print "Preparing to open:", len(self.revisar), "links"
                for url in copy.copy(self.revisar):
                    if url in self.revisados:
                        self.revisar.remove(url)
                        continue
                    print "Opening:", url
                    self.revisados.add(url)
                    self.revisar.remove(url)
                    data = self.open(url)
                    if data:
                        self.parsear(url, data)
                self.crawl(profundidad - 1)
    def parsear(self, location, data):
        # Resolve every matched link and sort it by its file extension.
        for link in self.expresion.findall(data):
            absoluto = self.paths(location, link[1])
            if absoluto:
                partes = urlparse(absoluto).path
                extension = partes.split(".")[-1]
                if extension in self.buscar:
                    self.encontrados[extension].add(absoluto)
                elif extension in self.permitidos:
                    self.cola.add(absoluto)
    def paths(self, relative, path):
        # Keep absolute links on the same host, record external ones,
        # and resolve relative links against the current page.
        if path.startswith(self.host):
            return path
        elif path.startswith("http"):
            self.externos.add(path)
            return False
        else:
            return urljoin(relative, path)
def mostrar(objeto):
    for tag in objeto.encontrados:
        print "\r\n" + tag
        print "\r\n".join(list(objeto.encontrados[tag]))
def info():
    print "Usage: crawler.py <host> <depth> <extensions>"
    print "The list of extensions to search for must be separated by , (comma)"
try:
    parametros = sys.argv[1:]
    url = parametros[0]
    profundidad = int(parametros[1])
    buscar = "".join(parametros[2:])
    buscar = buscar.split(",")
except (IndexError, ValueError):
    info()
    sys.exit()

a = Crawler(url, buscar)
try:
    a.crawl(profundidad)
except KeyboardInterrupt:
    # Show whatever was collected before the interrupt.
    mostrar(a)
    sys.exit()
else:
    mostrar(a)
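# Example invocation (assuming this file is saved as crawler.py and run under
# Python 2): crawl http://example.com two levels deep, collecting every link
# to .pdf and .jpg resources, then print them grouped by extension.
#
#   python crawler.py http://example.com 2 pdf,jpg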