From 6413abbd68e6a441fe0df4fe93e9e78e375636dc Mon Sep 17 00:00:00 2001 From: bnewbold Date: Fri, 22 Apr 2011 02:02:09 -0400 Subject: crude feed scraper --- piccast/feeds/management/__init__.py | 0 piccast/feeds/management/commands/__init__.py | 0 piccast/feeds/management/commands/scrape_feeds.py | 205 ++++++++++++++++++++++ piccast/feeds/views.py | 3 +- 4 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 piccast/feeds/management/__init__.py create mode 100644 piccast/feeds/management/commands/__init__.py create mode 100644 piccast/feeds/management/commands/scrape_feeds.py (limited to 'piccast/feeds') diff --git a/piccast/feeds/management/__init__.py b/piccast/feeds/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/piccast/feeds/management/commands/__init__.py b/piccast/feeds/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py new file mode 100644 index 0000000..1de9e4c --- /dev/null +++ b/piccast/feeds/management/commands/scrape_feeds.py @@ -0,0 +1,205 @@ +import re +import feedparser +import urllib +import sys +from datetime import * + +from django.core.management.base import BaseCommand, CommandError +from feeds.models import * + +# see Command definition at the end + +# this mechanism isn't robust yet b/c any small sets get parsed repeatedly +MIN_SET_SIZE = 1 # Need to find at least this many images for each set + +############################################################################### +def scrape_pics_from_html(pset, html): + + if(type(pset) != PicSet): + raise("Type error, expected PicSet") + + # assumptions: one per line, ordering, etc + mset = re.finditer('\'([^\.]+)\'...\n") + return + + for shortname in args: + try: + scrape_feed(shortname) + except Exception: + sys.stderr.write("Error scraping " + shortname + "\n") + + sys.stdout.flush() + sys.stdout.write('Done scraping feeds.\n') + diff --git a/piccast/feeds/views.py b/piccast/feeds/views.py index de335d6..daa4a28 100644 --- a/piccast/feeds/views.py +++ b/piccast/feeds/views.py @@ -34,7 +34,6 @@ def sets_by_feedslug(request, req_feedslug): json_serializer.serialize(sets) return HttpResponse(json_serializer.getvalue(), mimetype="application/json") - def pics_by_set(request, req_picset): # TODO: validate req_picset pics = Pic.objects.filter(set=req_picset) @@ -42,3 +41,5 @@ def pics_by_set(request, req_picset): json_serializer.serialize(pics) return HttpResponse(json_serializer.getvalue(), mimetype="application/json") +def + -- cgit v1.2.3