Requesting HTML with Celery, RabbitMQ, and Redis

 

To run this, you need to do two things (from the folder that contains the Python file, filename.py):

1)  Run the worker by executing the Python program with the “worker” argument:

$ celery -A tasks worker --loglevel=info

2)  Call the task, i.e. run the script:

$ python filename.py

#######################
#   grabhtml.py

import requests

from html import unescape

class GrabHTML(object):
    """Download a web page while sending a browser-like User-Agent header."""

    # Request headers sent with every download.
    HEADERS = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/57.0.2960.0 Safari/537.36',
    }

    def __init__(self):
        pass

    def get_html(self, url, timeout=2.0):
        """Fetch *url* with HTTP GET and return ``(html, status_code)``.

        Args:
            url: Address of the page to download.
            timeout: Timeout in seconds handed to ``requests.get``.

        Returns:
            A 2-tuple: the response body as ``str`` with HTML character
            references (``&amp;``, ``&#39;``, ...) unescaped, and the
            integer HTTP status code.

        Raises:
            Any exception raised by ``requests.get`` (for example on a
            connection failure or timeout) propagates to the caller.
        """
        r = requests.get(url, headers=self.HEADERS, timeout=timeout)

        # html.unescape() only accepts str. Passing the raw bytes of
        # r.content raised TypeError on every call, so the body was never
        # unescaped; decode first via r.text (uses the encoding that
        # requests determined for the response).
        html = unescape(r.text)
        return html, r.status_code

 

#####################
#   celery_grabhtml_redis.py
#   from grabhtml import GrabHTML   # need to import class if using separate files

from celery import Celery
#   pip install celery==3.1.21

# Celery application named 'tasks': messages travel through the AMQP broker
# on localhost, and Redis database 6 on localhost is the result backend.
app = Celery(
    'tasks',
    broker='amqp://localhost/',
    backend='redis://localhost/6',
)

#   using docker for both broker and backend
#   docker run -d --hostname my-rabbit --name some-rabbit rabbitmq:management
#   docker pull redis
 
@app.task
def scrape(url):
    """Celery task: download *url* and store its HTML in the result backend.

    When the page answers with HTTP 200 its body is stored under the key
    ``str(url)``. If fetching or storing raises, an empty value is stored
    under ``"error:" + str(url)`` so the failure is visible in the backend.

    Args:
        url: Address of the page to download.
    """
    print("-> Starting: [{}]".format(url))
    parser = GrabHTML()
    try:
        html, status = parser.get_html(url)
        if status == 200:
            app.backend.set(str(url), html)
            # Report success only when something was actually stored.
            print("-> saved html: [{}]".format(url))
        else:
            print("-> not saved (HTTP {}): [{}]".format(status, url))
    except Exception:
        print('error: {}'.format(url))
        # Record the failure through the same backend used for pages;
        # there is no separate Redis client object in this module.
        app.backend.set("error:" + str(url), "")
#   celery -A celery_grabhtml_redis worker --loglevel=info
#   ^ run above celery command in terminal while situated in same folder as current file

#   from celery.task.control import discard_all
#   discard_all()
#   ^ use above to clear celery queue

 

 

######################
#   celery_grabhtml_redis.py
#   from celery_grabhtml_redis import scrape    #   uncomment if you choose a separate file

# NOTE(review): this path looks garbled (it is a drive-relative "Curls.txt"
# on drive B:) -- confirm the intended location of the URL list.
file = r'b:Curls.txt'

# One URL per line. Surrounding whitespace is stripped and blank lines are
# skipped so that empty strings are never submitted as URLs.
with open(file, 'r') as f:
    urls = [line.strip() for line in f if line.strip()]


def produce(url_list=None):
    """Queue one ``scrape`` task per URL and report how many were queued.

    Args:
        url_list: Iterable of URLs to submit. When omitted, the
            module-level ``urls`` list is used.

    Returns:
        The number of URLs whose task was submitted without an error.
    """
    targets = urls if url_list is None else url_list
    submitted = 0
    for url in targets:
        try:
            scrape.delay(str(url))
        except Exception:
            # Keep going with the remaining URLs, but do not trap
            # KeyboardInterrupt/SystemExit so the run can still be aborted.
            print("ERROR ", url)
        else:
            print("* Submitted: [{}]".format(url))
            submitted += 1
    return submitted

# Submit every loaded URL to the queue. The call sits at module level, so it
# runs whenever this file is executed or imported.
produce()

 

#####################
#   urls.txt    # example

    #http://www.vanguard.com

    #http://www.blackrock.com

    #http://www.fidelity.com

    #http://www.nbim.no

    #http://www.troweprice.com

    #http://www.wellington.com

    #http://www.northerntrust.com

    #http://www.mfs.com

    #http://www.jpmorganfunds.com

    #http://www.us.dimensional.com

    #http://www.lgim.com

    #http://www.tiaa-cref.org

    #http://www.invesco.com

    #http://www.mcm.com

    #http://www.geodecapital.com

    #http://www.columbiamanagement.com

    #http://www.dodgeandcox.com

    #http://www.oppenheimerfunds.com

    #http://www.alliancebernstein.com

    #http://www.apg.nl

    #http://www.franklintempleton.com

    #http://www.jennison.com

    #http://www.gsam.com