from urlparse import urlparse
from urlparse import urljoin
import urllib2
import copy
import zlib
import sys
import re
class OpenUrl(object):
    """Minimal HTTP fetcher that sends a browser-like User-Agent."""
    headers = {"User-Agent":
               "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201"}

    def open(self, url):
        try:
            request = urllib2.Request(url, headers=self.headers)
            obj_web = urllib2.urlopen(request)
            response = obj_web.info()
            data = obj_web.read()
            # Decompress gzip-encoded bodies; otherwise keep the raw bytes.
            if "gzip" in (response.get("Content-Encoding") or ""):
                print "Gzip-encoded response"
                buffer = zlib.decompress(data, 16 + zlib.MAX_WBITS)
            else:
                buffer = data
        except Exception as e:
            print "({0}) Error: {1}".format(url, e)
            return False
        else:
            return buffer
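# Quick standalone sanity check for OpenUrl (hypothetical URL; requires a
# network connection): print the first 200 bytes of the decoded body.
#
#   print OpenUrl().open("http://example.com")[:200]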
class Crawler(OpenUrl):
    def __init__(self, url, buscar):
        self.host = "http://" + urlparse(url).hostname
        self.headers["Referer"] = self.host
        self.revisar = set([url])          # URLs queued for the current level
        self.revisados = set()             # URLs already fetched
        self.permitidos = ("html", "php", "php5")  # extensions worth crawling into
        self.buscar = buscar               # extensions to collect
        self.externos = set()              # links pointing outside the host
        self.cola = set()                  # URLs discovered for the next level
        # Capture the target of every href/src attribute, case-insensitively.
        self.expresion = re.compile(
            r"(href|src)\s*?=\s*?[\"\'](.*?)[\"\']", re.I | re.S)
        self.encontrados = dict((x, set()) for x in buscar)
    def crawl(self, profundidad):
        if profundidad > 0:
            # Merge the links discovered on the previous pass into the queue.
            self.revisar = self.revisar | self.cola
            if self.revisar:
                print "Preparing to open:", len(self.revisar), "links"
                for url in copy.copy(self.revisar):
                    if url in self.revisados:
                        self.revisar.remove(url)
                        continue
                    print "Opening:", url
                    self.revisados.add(url)
                    self.revisar.remove(url)
                    data = self.open(url)
                    if data:
                        self.parsear(url, data)
                self.crawl(profundidad - 1)
    def parsear(self, location, data):
        # Resolve every matched link and sort it by its file extension.
        for link in self.expresion.findall(data):
            absoluto = self.paths(location, link[1])
            if absoluto:
                partes = urlparse(absoluto).path
                extension = partes.split(".")[-1]
                if extension in self.buscar:
                    self.encontrados[extension].add(absoluto)
                elif extension in self.permitidos:
                    self.cola.add(absoluto)
    def paths(self, relative, path):
        # Keep absolute links on the same host, record external ones,
        # and resolve relative links against the current page.
        if path.startswith(self.host):
            return path
        elif path.startswith("http"):
            self.externos.add(path)
            return False
        else:
            return urljoin(relative, path)
def mostrar(objeto):
    for tag in objeto.encontrados:
        print "\r\n" + tag
        print "\r\n".join(list(objeto.encontrados[tag]))
def info():
    print "Usage: crawler.py <host> <depth> <extensions>"
    print "The list of extensions to search for must be separated by , (comma)"
try:
    parametros = sys.argv[1:]
    url = parametros[0]
    profundidad = int(parametros[1])
    buscar = "".join(parametros[2:])
    buscar = buscar.split(",")
except (IndexError, ValueError):
    info()
    sys.exit()

a = Crawler(url, buscar)
try:
    a.crawl(profundidad)
except KeyboardInterrupt:
    # Show whatever was collected before the interrupt.
    mostrar(a)
    sys.exit()
else:
    mostrar(a)
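# Example invocation (assuming this file is saved as crawler.py and run under
# Python 2): crawl http://example.com two levels deep, collecting every link
# to .pdf and .jpg resources, then print them grouped by extension.
#
#   python crawler.py http://example.com 2 pdf,jpg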