diff options
| author | bnewbold <bnewbold@robocracy.org> | 2011-05-03 21:05:57 -0400 | 
|---|---|---|
| committer | bnewbold <bnewbold@robocracy.org> | 2011-05-03 21:05:57 -0400 | 
| commit | cf569b08c0bd923778e476dcae1bbdf93d824502 (patch) | |
| tree | 6f8eecfacc68760ba2ee0dfc43a474be8face7de | |
| parent | 9dd97e5467b078aeb5b776ce74149a2307d820dc (diff) | |
| download | piccast-cf569b08c0bd923778e476dcae1bbdf93d824502.tar.gz piccast-cf569b08c0bd923778e476dcae1bbdf93d824502.zip  | |
basic scraper for nzinzi
| -rw-r--r-- | piccast/feeds/management/commands/scrape_feeds.py | 24 | 
1 file changed, 19 insertions, 5 deletions
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
index 265efd4..8c3f865 100644
--- a/piccast/feeds/management/commands/scrape_feeds.py
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -24,6 +24,8 @@ def scrape_pics_from_html(pset, html):
     # assumptions: one <img> per line, ordering, etc
     if(pset.feed.shortname == "vectortut"):
         mset = re.finditer('<img src="(http://\S+cloudfront.net\S+)" border=', html)
+    elif(pset.feed.shortname == "nzinzi"):
+        mset = re.finditer('src=[\'\"](http://ntamak.free.fr/\S+/\S+)[\'\"]', html)
     else:
         mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
@@ -43,7 +45,7 @@ def scrape_pics_from_html(pset, html):
         #p.caption = ""
         p.original_url = m.group(1)
         p.source_url = pset.source_url
-        if pset.feed.shortname != "vectortut":
+        if pset.feed.shortname not in ("vectortut", "nzinzi"):
             p.width = m.group(2)
             p.height = m.group(3)
         pics.append(p)
@@ -150,7 +152,10 @@ def scrape_feed(feed_shortname):
             pset['source_url'] = e.link
         pset['created'] = e.date
         pset['title'] = e.title
-        pset['category'] = e.category
+        if(feed_shortname == "nzinzi"):
+            pset['category'] = "Art"
+        else:
+            pset['category'] = e.category
         pset['description'] = e.description
         psets.append(pset)
@@ -171,11 +176,15 @@ def scrape_feed(feed_shortname):
         p = PicSet()
         p.feed = feed
         p.source_url = pset['source_url']
-        p.created = datetime.strptime(pset['created'][:-6], \
-            "%a, %d %b %Y %H:%M:%S")
+        if(feed_shortname == "nzinzi"):
+            p.created = datetime.strptime(pset['created'][:-10], \
+                "%Y-%m-%dT%H:%M:%S")
+        else:
+            p.created = datetime.strptime(pset['created'][:-6], \
+                "%a, %d %b %Y %H:%M:%S")
         # This oneline strips any paren content
         p.title = pset['title'].split(" (")[0]
-        if(feed_shortname in ("butdoesitfloat", "vectortut")):
+        if(feed_shortname in ("butdoesitfloat", "vectortut", "nzinzi")):
             # Category is ignored for these sites
             p.category = Category.objects.get_or_create(name="Art")[0]
         else:
@@ -208,6 +217,11 @@ def scrape_feed(feed_shortname):
             p.save()
             print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
             scrape_pics_bypage(p)
+        elif(feed_shortname == u"nzinzi"):
+            p.description = ""
+            p.save()
+            print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+            scrape_pics_from_html(p, pset['description'])
         else:
             print "ERROR: unknown PicFeed: " + pset['feed']
         #parse_page(s['source_url'], httplib.HTTPConnection)
