-rw-r--r--  .gitignore                                          |   2
-rw-r--r--  piccast/feeds/management/__init__.py                |   0
-rw-r--r--  piccast/feeds/management/commands/__init__.py       |   0
-rw-r--r--  piccast/feeds/management/commands/scrape_feeds.py   | 205
-rw-r--r--  piccast/feeds/views.py                              |   3
-rw-r--r--  piccast/urls.py                                     |   4
6 files changed, 211 insertions, 3 deletions
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,6 @@
 .*
 *.tmp
 *.old
+*.db
+settings.py
diff --git a/piccast/feeds/management/__init__.py b/piccast/feeds/management/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/piccast/feeds/management/__init__.py
diff --git a/piccast/feeds/management/commands/__init__.py b/piccast/feeds/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/piccast/feeds/management/commands/__init__.py
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
new file mode 100644
index 0000000..1de9e4c
--- /dev/null
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -0,0 +1,205 @@
+import re
+import feedparser
+import urllib
+import sys
+from datetime import *
+
+from django.core.management.base import BaseCommand, CommandError
+from feeds.models import *
+
+# see Command definition at the end
+
+# this mechanism isn't robust yet b/c any small sets get parsed repeatedly
+MIN_SET_SIZE = 1  # Need to find at least this many images for each set
+
+###############################################################################
+def scrape_pics_from_html(pset, html):
+
+    if(type(pset) != PicSet):
+        raise TypeError("expected PicSet")
+
+    # assumptions: one <img> per line, ordering, etc
+    mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
+
+    pics = list()
+    index = 0
+    for m in mset:
+        index += 1
+        p = Pic()
+        #print m.group(1)
+        #print m.group(2)
+        # This one line strips any paren content; disabled because it's just the
+        # set title repeated
+        #p.title = m.group(2).split(" (")[0]
+        p.title = "#" + str(index)
+        p.set = pset
+        # empty string isn't the same as leaving this null
+        #p.caption = ""
+        p.original_url = m.group(1)
+        p.source_url = pset.source_url
+        p.width = m.group(2)
+        p.height = m.group(3)
+        pics.append(p)
+
+    if(len(pics) < MIN_SET_SIZE):
+        print "Didn't find enough pictures to save this set (found " + \
+            str(len(pics)) + ", MIN_SET_SIZE=" + str(MIN_SET_SIZE) + ")"
+        pset.delete()
+        return
+
+    # TODO: oh boy, serial, this is a horrible way to do it!
+    for p in pics:
+        p.save()
+
+    # add a thumbnail image for the picset (TODO: resize thumbnail?)
+    if not pset.image:
+        pset.image = pics[0]
+        pset.save()
+
+    print "Found and saved " + str(len(pics)) + " new Pics"
+
+###############################################################################
+def scrape_pics_acidcow(pset):
+
+    if(type(pset) != PicSet):
+        raise TypeError("expected PicSet")
+
+    #print "DEBUG"
+    #html = open("./19340-evolution-16-pics.html")
+    print "Scraping from " + pset.source_url + "..."
+    html = urllib.urlopen(pset.source_url)
+
+    mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read())
+
+    pics = list()
+    index = 0
+    for m in mset:
+        index += 1
+        p = Pic()
+        #print m.group(1)
+        #print m.group(2)
+        # This oneline strips any paren content; disabled because it's just the
+        # set title repeated
+        #p.title = m.group(2).split(" (")[0]
+        p.title = "#" + str(index)
+        p.set = pset
+        p.caption = ""
+        p.original_url = m.group(1)
+        p.source_url = pset.source_url
+        pics.append(p)
+
+    # close the connection
+    html.close()
+
+    if(len(pics) < MIN_SET_SIZE):
+        print "Didn't find enough pictures to save this set (found " + \
+            str(len(pics)) + ", MIN_SET_SIZE=" + str(MIN_SET_SIZE) + ")"
+        pset.delete()
+        return
+
+    # TODO: oh boy, serial, this is a horrible way to do it!
+    for p in pics:
+        p.save()
+
+    # add a thumbnail image for the picset (TODO: resize thumbnail?)
+    if not pset.image:
+        pset.image = pics[0]
+        pset.save()
+
+    print "Found and saved " + str(len(pics)) + " new Pics"
+
+
+###############################################################################
+def scrape_feed(feed_shortname):
+
+    try:
+        feed = PicFeed.objects.get(shortname=feed_shortname)
+    except Exception as e:
+        sys.stderr.write("Error finding feed by shortname: " + \
+            feed_shortname + "\n")
+        raise e
+
+    print "Fetching feed for " + feed_shortname + ": " + feed.rssfeed_url
+    f = feedparser.parse(feed.rssfeed_url)
+    #print "DEBUG"
+    #f = feedparser.parse("./acidcow_com?format=xml")
+    #f = feedparser.parse("./ButDoesItFloat?format=xml")
+
+    psets = list()
+    for e in f['entries']:
+        pset = dict()
+        if(feed_shortname == "acidcow"):
+            pset['source_url'] = e.guid
+        elif(feed_shortname == "butdoesitfloat"):
+            pset['source_url'] = e.guid
+        else:
+            pset['source_url'] = e.link
+        pset['created'] = e.date
+        pset['title'] = e.title
+        pset['category'] = e.category
+        pset['description'] = e.description
+        psets.append(pset)
+
+    # clean up parser
+    del f
+
+    if(feed_shortname == u"acidcow"):
+        psets = filter(lambda s: s['category'] in (u'Pics', u'Picdump'), psets)
+
+    new_psets = filter(lambda s: \
+        0 == len(PicSet.objects.filter(source_url=s['source_url'])), \
+        psets)
+
+    for pset in new_psets:
+        print "Parsing PicSet: " + pset['title'] + " (" + pset['category'] + ")"
+        p = PicSet()
+        p.feed = feed
+        p.source_url = pset['source_url']
+        p.created = datetime.strptime(pset['created'][:-6], \
+            "%a, %d %b %Y %H:%M:%S")
+        # This oneline strips any paren content
+        p.title = pset['title'].split(" (")[0]
+        if(feed_shortname == u"butdoesitfloat"):
+            # Category is a list for this site
+            p.category = Category.objects.get_or_create(name="Art")[0]
+        else:
+            p.category = Category.objects.get_or_create(name=pset['category'])[0]
+
+        # Ok, this is where we split out and do custom, per-site processing
+        if(feed_shortname == u"acidcow"):
+            p.save()
+            print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+            scrape_pics_acidcow(p)
+        elif(feed_shortname == u"butdoesitfloat"):
+            # the descriptions here are usually important
+            # TODO: should comment URL instead of link url (which bounces)?
+            p.description = pset['description'].split("<img")[0]
+            p.save()
+            print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+            scrape_pics_from_html(p, pset['description'])
+        else:
+            print "ERROR: unknown PicFeed: " + feed_shortname
+            #parse_page(s['source_url'], httplib.HTTPConnection)
+        break # DEBUG
+
+
+###############################################################################
+
+class Command(BaseCommand):
+    args = '<feed_shortname feed_shortname ...>'
+    help = 'Fetches RSS, parses, and possibly scrapes HTML for the given feeds/sources'
+
+    def handle(self, *args, **options):
+        if len(args) < 1:
+            sys.stderr.write("Need to specify at least one <feed_shortname>...\n")
+            return
+
+        for shortname in args:
+            try:
+                scrape_feed(shortname)
+            except Exception:
+                sys.stderr.write("Error scraping " + shortname + "\n")
+
+        sys.stdout.flush()
+        sys.stdout.write('Done scraping feeds.\n')
+
diff --git a/piccast/feeds/views.py b/piccast/feeds/views.py
index de335d6..daa4a28 100644
--- a/piccast/feeds/views.py
+++ b/piccast/feeds/views.py
@@ -34,7 +34,6 @@ def sets_by_feedslug(request, req_feedslug):
   json_serializer.serialize(sets)
   return HttpResponse(json_serializer.getvalue(), mimetype="application/json")
 
-
 def pics_by_set(request, req_picset):
   # TODO: validate req_picset
   pics = Pic.objects.filter(set=req_picset)
@@ -42,3 +41,5 @@ def pics_by_set(request, req_picset):
   json_serializer.serialize(pics)
   return HttpResponse(json_serializer.getvalue(), mimetype="application/json")
 
+def
+
diff --git a/piccast/urls.py b/piccast/urls.py
index 0979de6..daa67da 100644
--- a/piccast/urls.py
+++ b/piccast/urls.py
@@ -8,8 +8,8 @@ admin.autodiscover()
 databrowse.site.register(PicFeed)
 databrowse.site.register(PicSet)
-#databrowse.site.register(Pic)
-#databrowse.site.register(Category)
+databrowse.site.register(Pic)
+databrowse.site.register(Category)
 
 urlpatterns = patterns('',
   (r'^admin/doc/', include('django.contrib.admindocs.urls')),
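
Usage sketch: scrape_feeds is a plain Django management command, so it would normally be run from the project root as ./manage.py scrape_feeds <feed_shortname> ..., with shortnames such as "acidcow" or "butdoesitfloat" as referenced in the code above. The same invocation from Python, assuming DJANGO_SETTINGS_MODULE already points at the piccast settings (an assumption, not shown in this commit):

    # Minimal sketch; equivalent to "./manage.py scrape_feeds acidcow butdoesitfloat".
    # Assumes Django is already configured for the piccast project.
    from django.core.management import call_command
    call_command('scrape_feeds', 'acidcow', 'butdoesitfloat')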