author | bnewbold <bnewbold@robocracy.org> | 2011-05-03 19:56:32 -0400
---|---|---
committer | bnewbold <bnewbold@robocracy.org> | 2011-05-03 19:56:32 -0400
commit | d53fcbecdc690bd42372f3ce9c322f849220e225 | |
tree | 86a72e9b2a3fcd5107af11fa12a807258bc3334b | |
parent | a8cf5584204d77bd3f234fe579844ca84bb3558d | |
download | piccast-d53fcbecdc690bd42372f3ce9c322f849220e225.tar.gz, piccast-d53fcbecdc690bd42372f3ce9c322f849220e225.zip | |
basic vectortut scraper added
-rw-r--r-- | piccast/feeds/management/commands/scrape_feeds.py | 25 |
1 file changed, 20 insertions(+), 5 deletions(-)
```diff
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
index bfa9bfc..a023e9f 100644
--- a/piccast/feeds/management/commands/scrape_feeds.py
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -10,6 +10,8 @@ from feeds.models import *
 
 # see Command definition at the end
 
+# TODO: need a u.encode('utf-8') somewhere in here...
+
 # this mechanism isn't robust yet b/c any small sets get parsed repeatedly
 MIN_SET_SIZE = 3 # Need to find at least this many images for each set
 
@@ -20,7 +22,10 @@ def scrape_pics_from_html(pset, html):
         raise("Type error, expected PicSet")
 
     # assumptions: one <img> per line, ordering, etc
-    mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
+    if(pset.feed.shortname == "vectortut"):
+        mset = re.finditer('<img src="(http://\S+cloudfront.net\S+)" border=', html)
+    else:
+        mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
 
     pics = list()
     index = 0
@@ -38,8 +43,9 @@ def scrape_pics_from_html(pset, html):
         #p.caption = ""
         p.original_url = m.group(1)
         p.source_url = pset.source_url
-        p.width = m.group(2)
-        p.height = m.group(3)
+        if pset.feed.shortname != "vectortut":
+            p.width = m.group(2)
+            p.height = m.group(3)
         pics.append(p)
 
     if(len(pics) < MIN_SET_SIZE):
@@ -148,6 +154,8 @@ def scrape_feed(feed_shortname):
 
     if(feed_shortname == u"acidcow"):
         psets = filter(lambda s: s['category'] in (u'Pics', u'Picdump',u'Celebs',u'Girls',u'Cars'), psets)
+    elif(feed_shortname == u"vectortut"):
+        psets = filter(lambda s: s['category'] in (u'Inspirational'), psets)
 
     new_psets = filter(lambda s: \
         0 == len(PicSet.objects.filter(source_url=s['source_url'])), \
@@ -162,8 +170,8 @@ def scrape_feed(feed_shortname):
                                       "%a, %d %b %Y %H:%M:%S")
         # This oneline strips any paren content
         p.title = pset['title'].split(" (")[0]
-        if(feed_shortname == u"butdoesitfloat"):
-            # Category is a list for this site
+        if(feed_shortname in ("butdoesitfloat", "vectortut")):
+            # Category is ignored for these sites
             p.category = Category.objects.get_or_create(name="Art")[0]
         else:
             p.category = Category.objects.get_or_create(name=pset['category'])[0]
@@ -184,6 +192,12 @@ def scrape_feed(feed_shortname):
             p.save()
             print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
             scrape_pics_from_html(p, pset['description'])
+        elif(feed_shortname == u"vectortut"):
+            print pset['description']
+            p.description = ""
+            p.save()
+            print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+            scrape_pics_from_html(p, pset['description'])
         else:
             print "ERROR: unknown PicFeed: " + pset['feed']
             #parse_page(s['source_url'], httplib.HTTPConnection)
@@ -207,6 +221,7 @@ class Command(BaseCommand):
             except Exception as e:
                 sys.stderr.write("Error scraping " + shortname + ":\n")
                 sys.stderr.write(str(e) + "\n")
+                #raise e
             sys.stdout.flush()
 
         sys.stdout.write('Done scraping feeds.\n')
```
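For context, a minimal sanity check of the new vectortut pattern (not part of the commit). The sample markup and the cloudfront hostname below are hypothetical stand-ins for what the feed's RSS description HTML is assumed to look like:

```python
import re

# Hypothetical sample of a vectortut RSS description line (assumed shape).
sample = '<img src="http://dxyz.cloudfront.net/2011/04/example_01.jpg" border="0" />'

# Same pattern the patch adds for the vectortut branch: capture only the
# image URL; this feed's markup carries no width/height attributes to grab.
m = re.search(r'<img src="(http://\S+cloudfront.net\S+)" border=', sample)
if m:
    print(m.group(1))  # -> http://dxyz.cloudfront.net/2011/04/example_01.jpg
```

One caveat worth flagging in the new category filter: `s['category'] in (u'Inspirational')` is a substring test against a plain string, since `(u'Inspirational')` is not a one-element tuple; `(u'Inspirational',)` would match the intent of the acidcow branch above.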