import re import feedparser import urllib import sys from datetime import * from django.core.management.base import BaseCommand, CommandError from django.utils.html import strip_tags from feeds.models import * # see Command definition at the end # this mechanism isn't robust yet b/c any small sets get parsed repeatedly MIN_SET_SIZE = 3 # Need to find at least this many images for each set ############################################################################### def scrape_pics_from_html(pset, html): if(type(pset) != PicSet): raise("Type error, expected PicSet") # assumptions: one per line, ordering, etc mset = re.finditer(' $\'([^\.]+)\'$ 0): p.description = strip_tags(pset['description']) if(p.description.startswith(u"Similar posts:")): p.description = None p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_acidcow(p) elif(feed_shortname == u"butdoesitfloat"): # the descriptions here are usually important # TODO: should comment URL instead of link url (which bounces)? p.description = pset['description'].split("...\n") return for shortname in args: try: scrape_feed(shortname) except Exception as e: sys.stderr.write("Error scraping " + shortname + ":\n") sys.stderr.write(str(e) + "\n") sys.stdout.flush() sys.stdout.write('Done scraping feeds.\n')