import re
import feedparser
import urllib
import sys
from datetime import *
from django.core.management.base import BaseCommand, CommandError
from django.utils.html import strip_tags
from feeds.models import *
# see Command definition at the end
# NOTE: this mechanism isn't robust yet, because any small sets get parsed
# repeatedly (presumably sets with fewer than MIN_SET_SIZE images; TODO confirm).
MIN_SET_SIZE = 3 # Need to find at least this many images for each set
###############################################################################
# Scrape picture entries for a PicSet out of a chunk of HTML.
# NOTE(review): only the opening lines of this function are intact in this
# text. The regex literal on the `mset = re.finditer(` line is cut off, and
# the statements after it use `p` and `feed_shortname`, which are not defined
# in this function, so they appear to belong to a different routine. Restore
# the missing span (and the indentation) from version control before
# changing any of this code.
def scrape_pics_from_html(pset, html):
# Exact-type check: instances of PicSet subclasses are rejected as well.
if(type(pset) != PicSet):
# NOTE(review): this raises a plain str, which is not an exception class. On
# Python 2.6+ the raise statement itself fails with a TypeError, so the
# message below is presumably never delivered as written -- TODO confirm.
raise("Type error, expected PicSet")
# assumptions: one per line, ordering, etc
# NOTE(review): the next line is truncated inside its string literal; the
# trailing ` 0):` looks like the end of an unrelated `if` condition.
mset = re.finditer(' 0):
# --- From here to the `return`: fragment of a per-feed routine whose header
# is not present in this text. ---
# Store the entry's description with HTML tags stripped.
p.description = strip_tags(pset['description'])
# Discard descriptions that begin with "Similar posts:".
if(p.description.startswith(u"Similar posts:")):
p.description = None
p.save()
print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
scrape_pics_acidcow(p)
elif(feed_shortname == u"butdoesitfloat"):
# the descriptions here are usually important
# TODO: should use the comment URL instead of the link URL (which bounces)?
# NOTE(review): split() returns a list, so a list rather than text is
# assigned to p.description here; presumably one element was intended
# (the lines that followed are not visible) -- TODO confirm.
p.description = pset['description'].split("...\n")
return
# Scrape each feed named in `args`, one after another.
# NOTE(review): indentation is missing in this text, so the nesting of the
# statements below cannot be read off directly -- in particular whether the
# stdout flush runs once per feed, only after an error, or once at the end.
for shortname in args:
try:
scrape_feed(shortname)
# An exception from one feed is reported on stderr and is not re-raised by
# the visible lines.
except Exception as e:
sys.stderr.write("Error scraping " + shortname + ":\n")
sys.stderr.write(str(e) + "\n")
# NOTE(review): this flushes stdout although the two writes above go to
# stderr -- confirm which stream was intended.
sys.stdout.flush()
sys.stdout.write('Done scraping feeds.\n')