about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: bnewbold <bnewbold@robocracy.org> 2011-05-03 20:24:50 -0400
committer: bnewbold <bnewbold@robocracy.org> 2011-05-03 20:24:50 -0400
commit: 9dd97e5467b078aeb5b776ce74149a2307d820dc (patch)
tree: 8564909703ffccd955bc0dd29697d7a5ac72ed3b
parent: d53fcbecdc690bd42372f3ce9c322f849220e225 (diff)
download: piccast-9dd97e5467b078aeb5b776ce74149a2307d820dc.zip
download: piccast-9dd97e5467b078aeb5b776ce74149a2307d820dc.tar.gz
added draft jetfunny scraper
-rw-r--r-- piccast/feeds/management/commands/scrape_feeds.py | 18
1 files changed, 14 insertions, 4 deletions
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
index a023e9f..265efd4 100644
--- a/piccast/feeds/management/commands/scrape_feeds.py
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -67,7 +67,7 @@ def scrape_pics_from_html(pset, html):
print "Found and saved " + str(len(pics)) + " new Pics"
###############################################################################
-def scrape_pics_acidcow(pset):
+def scrape_pics_bypage(pset):
if(type(pset) != PicSet):
raise("Type error, expected PicSet")
@@ -77,7 +77,12 @@ def scrape_pics_acidcow(pset):
print "Scraping from " + pset.source_url + "..."
html = urllib.urlopen(pset.source_url)
- mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read())
+ if(pset.feed.shortname == "acidcow"):
+ mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read())
+ elif(pset.feed.shortname == "jetfunny"):
+ mset = re.finditer('src=\'(http://jetfunnypictures.com/img/jetfunnypictures.com/\S+)\'', html.read())
+ else:
+ mset = re.finditer('src=[\'\"](http://\S+)[\"\']', html.read())
pics = list()
index = 0
@@ -184,7 +189,7 @@ def scrape_feed(feed_shortname):
p.description = None
p.save()
print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
- scrape_pics_acidcow(p)
+ scrape_pics_bypage(p)
elif(feed_shortname == u"butdoesitfloat"):
# the descriptions here are usually important
# TODO: should comment URL instead of link url (which bounces)?
@@ -193,11 +198,16 @@ def scrape_feed(feed_shortname):
print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
scrape_pics_from_html(p, pset['description'])
elif(feed_shortname == u"vectortut"):
- print pset['description']
p.description = ""
p.save()
print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
scrape_pics_from_html(p, pset['description'])
+ elif(feed_shortname == u"jetfunny"):
+ if(len(pset['description']) > 0):
+ p.description = strip_tags(pset['description'].split("</p>")[0])
+ p.save()
+ print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+ scrape_pics_bypage(p)
else:
print "ERROR: unknown PicFeed: " + pset['feed']
#parse_page(s['source_url'], httplib.HTTPConnection)