# NOTE(review): THIS FILE IS CORRUPTED — do not edit in place; recover from
# version control. The module has been collapsed onto two physical lines and
# content resembling HTML tags appears to have been stripped out wholesale.
# Evidence of data loss (all visible on the lines below):
#   - `mset = re.finditer('\'([^\.]+)\' 0):` — a regex literal is truncated
#     and fused into a later `... > 0):` condition; the pattern text between
#     them is gone.
#   - `p.description = pset['description'].split(" 0):` — the split delimiter
#     (presumably something like "<br/>") has been eaten.
#   - `sys.stderr.write("Need to specify at least one ...\n")` — the noun
#     (presumably "<feed_shortname>") is missing; likewise `args = ''`.
# What is still recoverable from the residue, for whoever restores this file:
#   - Python 2 code (print statements, u"" literals) for a Django custom
#     management command ("Fetches RSS, parses, and possibly scrapes HTML").
#   - Imports: re, feedparser, urllib, sys, datetime, django BaseCommand /
#     strip_tags, and feeds.models (PicSet et al. come from the star import).
#   - MIN_SET_SIZE = 3: minimum image count for a set, per its own comment.
#   - scrape_pics_from_html(pset, html): type-checks pset against PicSet,
#     then branches on pset.feed.shortname ("vectortut", ...) to scrape image
#     URLs out of raw HTML with per-feed regexes (now truncated).
#   - A fused-in fragment of a feed-dispatch function (its `def` line was
#     lost) that branches on feed_shortname ("butdoesitfloat", "nzinzi", ...),
#     fills p.description (strip_tags / split / startswith(u"Similar posts:")
#     cleanup), saves, prints a "Great, saved:" progress line, and then calls
#     scrape_pics_bypage(p) or scrape_pics_from_html(p, ...) per feed;
#     unknown shortnames print an "ERROR: unknown PicFeed:" message.
#   - class Command(BaseCommand): handle() requires at least one shortname
#     argument, loops over args calling scrape_feed(shortname), reports any
#     exception to stderr without aborting the loop (the `raise e` is
#     commented out), flushes stdout, and finally prints a completion line.
# The original code bytes are preserved verbatim below.
import re import feedparser import urllib import sys from datetime import * from django.core.management.base import BaseCommand, CommandError from django.utils.html import strip_tags from feeds.models import * # see Command definition at the end # TODO: need a u.encode('utf-8') somewhere in here... # this mechanism isn't robust yet b/c any small sets get parsed repeatedly MIN_SET_SIZE = 3 # Need to find at least this many images for each set ############################################################################### def scrape_pics_from_html(pset, html): if(type(pset) != PicSet): raise("Type error, expected PicSet") # assumptions: one per line, ordering, etc if(pset.feed.shortname == "vectortut"): mset = re.finditer('\'([^\.]+)\' 0): p.description = strip_tags(pset['description']) if(p.description.startswith(u"Similar posts:")): p.description = ' ' p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_bypage(p) elif(feed_shortname == u"butdoesitfloat"): # the descriptions here are usually important # TODO: should comment URL instead of link url (which bounces)? p.description = pset['description'].split(" 0): p.description = strip_tags(pset['description'].split("

")[0]) p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_bypage(p) elif(feed_shortname == u"nzinzi"): p.description = "" p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_from_html(p, pset['description']) else: print "ERROR: unknown PicFeed: " + pset['feed'] #parse_page(s['source_url'], httplib.HTTPConnection) #break # DEBUG ############################################################################### class Command(BaseCommand): args = '' help = 'Fetches RSS, parses, and possibly scrapes HTML for the given feeds/sources' def handle(self, *args, **options): if len(args) < 1: sys.stderr.write("Need to specify at least one ...\n") return for shortname in args: try: scrape_feed(shortname) except Exception as e: sys.stderr.write("Error scraping " + shortname + ":\n") sys.stderr.write(str(e) + "\n") #raise e sys.stdout.flush() sys.stdout.write('Done scraping feeds.\n')