about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author bnewbold <bnewbold@robocracy.org> 2011-05-03 21:05:57 -0400
committer bnewbold <bnewbold@robocracy.org> 2011-05-03 21:05:57 -0400
commit cf569b08c0bd923778e476dcae1bbdf93d824502 (patch)
tree 6f8eecfacc68760ba2ee0dfc43a474be8face7de
parent 9dd97e5467b078aeb5b776ce74149a2307d820dc (diff)
download piccast-cf569b08c0bd923778e476dcae1bbdf93d824502.zip
piccast-cf569b08c0bd923778e476dcae1bbdf93d824502.tar.gz
basic scraper for nzinzi
-rw-r--r-- piccast/feeds/management/commands/scrape_feeds.py 24
1 files changed, 19 insertions, 5 deletions
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
index 265efd4..8c3f865 100644
--- a/piccast/feeds/management/commands/scrape_feeds.py
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -24,6 +24,8 @@ def scrape_pics_from_html(pset, html):
# assumptions: one <img> per line, ordering, etc
if(pset.feed.shortname == "vectortut"):
mset = re.finditer('<img src="(http://\S+cloudfront.net\S+)" border=', html)
+ elif(pset.feed.shortname == "nzinzi"):
+ mset = re.finditer('src=[\'\"](http://ntamak.free.fr/\S+/\S+)[\'\"]', html)
else:
mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
@@ -43,7 +45,7 @@ def scrape_pics_from_html(pset, html):
#p.caption = ""
p.original_url = m.group(1)
p.source_url = pset.source_url
- if pset.feed.shortname != "vectortut":
+ if pset.feed.shortname not in ("vectortut", "nzinzi"):
p.width = m.group(2)
p.height = m.group(3)
pics.append(p)
@@ -150,7 +152,10 @@ def scrape_feed(feed_shortname):
pset['source_url'] = e.link
pset['created'] = e.date
pset['title'] = e.title
- pset['category'] = e.category
+ if(feed_shortname == "nzinzi"):
+ pset['category'] = "Art"
+ else:
+ pset['category'] = e.category
pset['description'] = e.description
psets.append(pset)
@@ -171,11 +176,15 @@ def scrape_feed(feed_shortname):
p = PicSet()
p.feed = feed
p.source_url = pset['source_url']
- p.created = datetime.strptime(pset['created'][:-6], \
- "%a, %d %b %Y %H:%M:%S")
+ if(feed_shortname == "nzinzi"):
+ p.created = datetime.strptime(pset['created'][:-10], \
+ "%Y-%m-%dT%H:%M:%S")
+ else:
+ p.created = datetime.strptime(pset['created'][:-6], \
+ "%a, %d %b %Y %H:%M:%S")
# This oneline strips any paren content
p.title = pset['title'].split(" (")[0]
- if(feed_shortname in ("butdoesitfloat", "vectortut")):
+ if(feed_shortname in ("butdoesitfloat", "vectortut", "nzinzi")):
# Category is ignored for these sites
p.category = Category.objects.get_or_create(name="Art")[0]
else:
@@ -208,6 +217,11 @@ def scrape_feed(feed_shortname):
p.save()
print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
scrape_pics_bypage(p)
+ elif(feed_shortname == u"nzinzi"):
+ p.description = ""
+ p.save()
+ print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+ scrape_pics_from_html(p, pset['description'])
else:
print "ERROR: unknown PicFeed: " + pset['feed']
#parse_page(s['source_url'], httplib.HTTPConnection)