diff options
author | bnewbold <bnewbold@robocracy.org> | 2011-05-03 21:05:57 -0400 |
---|---|---|
committer | bnewbold <bnewbold@robocracy.org> | 2011-05-03 21:05:57 -0400 |
commit | cf569b08c0bd923778e476dcae1bbdf93d824502 (patch) | |
tree | 6f8eecfacc68760ba2ee0dfc43a474be8face7de | |
parent | 9dd97e5467b078aeb5b776ce74149a2307d820dc (diff) | |
download | piccast-cf569b08c0bd923778e476dcae1bbdf93d824502.tar.gz piccast-cf569b08c0bd923778e476dcae1bbdf93d824502.zip |
basic scraper for nzinzi
-rw-r--r-- | piccast/feeds/management/commands/scrape_feeds.py | 24 |
1 file changed, 19 insertions, 5 deletions
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py index 265efd4..8c3f865 100644 --- a/piccast/feeds/management/commands/scrape_feeds.py +++ b/piccast/feeds/management/commands/scrape_feeds.py @@ -24,6 +24,8 @@ def scrape_pics_from_html(pset, html): # assumptions: one <img> per line, ordering, etc if(pset.feed.shortname == "vectortut"): mset = re.finditer('<img src="(http://\S+cloudfront.net\S+)" border=', html) + elif(pset.feed.shortname == "nzinzi"): + mset = re.finditer('src=[\'\"](http://ntamak.free.fr/\S+/\S+)[\'\"]', html) else: mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html) @@ -43,7 +45,7 @@ def scrape_pics_from_html(pset, html): #p.caption = "" p.original_url = m.group(1) p.source_url = pset.source_url - if pset.feed.shortname != "vectortut": + if pset.feed.shortname not in ("vectortut", "nzinzi"): p.width = m.group(2) p.height = m.group(3) pics.append(p) @@ -150,7 +152,10 @@ def scrape_feed(feed_shortname): pset['source_url'] = e.link pset['created'] = e.date pset['title'] = e.title - pset['category'] = e.category + if(feed_shortname == "nzinzi"): + pset['category'] = "Art" + else: + pset['category'] = e.category pset['description'] = e.description psets.append(pset) @@ -171,11 +176,15 @@ def scrape_feed(feed_shortname): p = PicSet() p.feed = feed p.source_url = pset['source_url'] - p.created = datetime.strptime(pset['created'][:-6], \ - "%a, %d %b %Y %H:%M:%S") + if(feed_shortname == "nzinzi"): + p.created = datetime.strptime(pset['created'][:-10], \ + "%Y-%m-%dT%H:%M:%S") + else: + p.created = datetime.strptime(pset['created'][:-6], \ + "%a, %d %b %Y %H:%M:%S") # This oneline strips any paren content p.title = pset['title'].split(" (")[0] - if(feed_shortname in ("butdoesitfloat", "vectortut")): + if(feed_shortname in ("butdoesitfloat", "vectortut", "nzinzi")): # Category is ignored for these sites p.category = 
Category.objects.get_or_create(name="Art")[0] else: @@ -208,6 +217,11 @@ def scrape_feed(feed_shortname): p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_bypage(p) + elif(feed_shortname == u"nzinzi"): + p.description = "" + p.save() + print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" + scrape_pics_from_html(p, pset['description']) else: print "ERROR: unknown PicFeed: " + pset['feed'] #parse_page(s['source_url'], httplib.HTTPConnection) |