From d53fcbecdc690bd42372f3ce9c322f849220e225 Mon Sep 17 00:00:00 2001 From: bnewbold Date: Tue, 3 May 2011 19:56:32 -0400 Subject: basic vectortut scraper added --- piccast/feeds/management/commands/scrape_feeds.py | 25 ++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'piccast') diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py index bfa9bfc..a023e9f 100644 --- a/piccast/feeds/management/commands/scrape_feeds.py +++ b/piccast/feeds/management/commands/scrape_feeds.py @@ -10,6 +10,8 @@ from feeds.models import * # see Command definition at the end +# TODO: need a u.encode('utf-8') somewhere in here... + # this mechanism isn't robust yet b/c any small sets get parsed repeatedly MIN_SET_SIZE = 3 # Need to find at least this many images for each set @@ -20,7 +22,10 @@ def scrape_pics_from_html(pset, html): raise("Type error, expected PicSet") # assumptions: one per line, ordering, etc - mset = re.finditer('