diff options
author | bnewbold <bnewbold@robocracy.org> | 2011-05-03 20:24:50 -0400 |
---|---|---|
committer | bnewbold <bnewbold@robocracy.org> | 2011-05-03 20:24:50 -0400 |
commit | 9dd97e5467b078aeb5b776ce74149a2307d820dc (patch) | |
tree | 8564909703ffccd955bc0dd29697d7a5ac72ed3b | |
parent | d53fcbecdc690bd42372f3ce9c322f849220e225 (diff) | |
download | piccast-9dd97e5467b078aeb5b776ce74149a2307d820dc.tar.gz piccast-9dd97e5467b078aeb5b776ce74149a2307d820dc.zip |
added draft jetfunny scraper
-rw-r--r-- | piccast/feeds/management/commands/scrape_feeds.py | 18 |
1 file changed, 14 insertions, 4 deletions
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py index a023e9f..265efd4 100644 --- a/piccast/feeds/management/commands/scrape_feeds.py +++ b/piccast/feeds/management/commands/scrape_feeds.py @@ -67,7 +67,7 @@ def scrape_pics_from_html(pset, html): print "Found and saved " + str(len(pics)) + " new Pics" ############################################################################### -def scrape_pics_acidcow(pset): +def scrape_pics_bypage(pset): if(type(pset) != PicSet): raise("Type error, expected PicSet") @@ -77,7 +77,12 @@ def scrape_pics_acidcow(pset): print "Scraping from " + pset.source_url + "..." html = urllib.urlopen(pset.source_url) - mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read()) + if(pset.feed.shortname == "acidcow"): + mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read()) + elif(pset.feed.shortname == "jetfunny"): + mset = re.finditer('src=\'(http://jetfunnypictures.com/img/jetfunnypictures.com/\S+)\'', html.read()) + else: + mset = re.finditer('src=[\'\"](http://\S+)[\"\']', html.read()) pics = list() index = 0 @@ -184,7 +189,7 @@ def scrape_feed(feed_shortname): p.description = None p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" - scrape_pics_acidcow(p) + scrape_pics_bypage(p) elif(feed_shortname == u"butdoesitfloat"): # the descriptions here are usually important # TODO: should comment URL instead of link url (which bounces)? 
@@ -193,11 +198,16 @@ def scrape_feed(feed_shortname): print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_from_html(p, pset['description']) elif(feed_shortname == u"vectortut"): - print pset['description'] p.description = "" p.save() print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" scrape_pics_from_html(p, pset['description']) + elif(feed_shortname == u"jetfunny"): + if(len(pset['description']) > 0): + p.description = strip_tags(pset['description'].split("</p>")[0]) + p.save() + print "Great, saved: " + p.title + " (id=" + str(p.id) + ")" + scrape_pics_bypage(p) else: print "ERROR: unknown PicFeed: " + pset['feed'] #parse_page(s['source_url'], httplib.HTTPConnection) |