From 6413abbd68e6a441fe0df4fe93e9e78e375636dc Mon Sep 17 00:00:00 2001
From: bnewbold
Date: Fri, 22 Apr 2011 02:02:09 -0400
Subject: crude feed scraper

---
 piccast/feeds/management/commands/__init__.py | 0
 piccast/feeds/management/commands/scrape_feeds.py | 205 ++++++++++++++++++++++
 2 files changed, 205 insertions(+)
 create mode 100644 piccast/feeds/management/commands/__init__.py
 create mode 100644 piccast/feeds/management/commands/scrape_feeds.py

(limited to 'piccast/feeds/management/commands')

diff --git a/piccast/feeds/management/commands/__init__.py b/piccast/feeds/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
new file mode 100644
index 0000000..1de9e4c
--- /dev/null
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -0,0 +1,205 @@
+import re
+import feedparser
+import urllib
+import sys
+from datetime import *
+
+from django.core.management.base import BaseCommand, CommandError
+from feeds.models import *
+
+# see Command definition at the end
+
+# this mechanism isn't robust yet b/c any small sets get parsed repeatedly
+MIN_SET_SIZE = 1 # Need to find at least this many images for each set
+
+###############################################################################
+def scrape_pics_from_html(pset, html):
+
+    if(type(pset) != PicSet):
+        raise("Type error, expected PicSet")
+
+    # assumptions: one per line, ordering, etc
+    mset = re.finditer('\'([^\.]+)\'...\n")
+            return
+
+        for shortname in args:
+            try:
+                scrape_feed(shortname)
+            except Exception:
+                sys.stderr.write("Error scraping " + shortname + "\n")
+
+        sys.stdout.flush()
+        sys.stdout.write('Done scraping feeds.\n')
+
--
cgit v1.2.3