aboutsummaryrefslogtreecommitdiffstats
path: root/piccast/feeds/management/commands
diff options
context:
space:
mode:
author: bnewbold <bnewbold@robocracy.org> 2011-04-22 02:02:09 -0400
committer: bnewbold <bnewbold@robocracy.org> 2011-04-22 02:02:09 -0400
commit: 6413abbd68e6a441fe0df4fe93e9e78e375636dc (patch)
tree: 4baa22d55982aa170698690da809e7ef9480e2a8 /piccast/feeds/management/commands
parent: 08d8e7ebf5845ea8e7dd2e0eaaf61b3f9a8ea1ce (diff)
download: piccast-6413abbd68e6a441fe0df4fe93e9e78e375636dc.zip
download: piccast-6413abbd68e6a441fe0df4fe93e9e78e375636dc.tar.gz
crude feed scraper
Diffstat (limited to 'piccast/feeds/management/commands')
-rw-r--r-- piccast/feeds/management/commands/__init__.py 0
-rw-r--r-- piccast/feeds/management/commands/scrape_feeds.py 205
2 files changed, 205 insertions, 0 deletions
diff --git a/piccast/feeds/management/commands/__init__.py b/piccast/feeds/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/piccast/feeds/management/commands/__init__.py
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
new file mode 100644
index 0000000..1de9e4c
--- /dev/null
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -0,0 +1,205 @@
+import re
+import feedparser
+import urllib
+import sys
+from datetime import *
+
+from django.core.management.base import BaseCommand, CommandError
+from feeds.models import *
+
+# see Command definition at the end
+
+# Threshold used by the scrape_pics_* functions below: a PicSet for which
+# fewer images than this are found is deleted again instead of being kept.
+# this mechanism isn't robust yet b/c any small sets get parsed repeatedly
+MIN_SET_SIZE = 1 # Need to find at least this many images for each set
+
+###############################################################################
+def scrape_pics_from_html(pset, html):
+
+ if(type(pset) != PicSet):
+ raise("Type error, expected PicSet")
+
+ # assumptions: one <img> per line, ordering, etc
+ mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
+
+ pics = list()
+ index = 0
+ for m in mset:
+ index += 1
+ p = Pic()
+ #print m.group(1)
+ #print m.group(2)
+ # This one line strips any paren content; disabled because it's just the
+ # set title repeated
+ #p.title = m.group(2).split(" (")[0]
+ p.title = "#" + str(index)
+ p.set = pset
+ # empty string isn't the same as leaving this null
+ #p.caption = ""
+ p.original_url = m.group(1)
+ p.source_url = pset.source_url
+ p.width = m.group(2)
+ p.height = m.group(3)
+ pics.append(p)
+
+ if(len(pics) < MIN_SET_SIZE):
+ print "Didn't find enough pictures to save this set (found " + \
+ str(len(pics)) + ", MIN_SET_SIZE=" + str(MIN_SET_SIZE) + ")"
+ pset.delete()
+ return
+
+ # TODO: oh boy, serial, this is a horrible way to do it!
+ for p in pics:
+ p.save()
+
+ # add a thumbnail image for the picset (TODO: resize thumbnail?)
+ if not pset.image:
+ pset.image = pics[0]
+ pset.save()
+
+ print "Found and saved " + str(len(pics)) + " new Pics"
+
+###############################################################################
+def scrape_pics_acidcow(pset):
+
+ if(type(pset) != PicSet):
+ raise("Type error, expected PicSet")
+
+ #print "DEBUG"
+ #html = open("./19340-evolution-16-pics.html")
+ print "Scraping from " + pset.source_url + "..."
+ html = urllib.urlopen(pset.source_url)
+
+ mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read())
+
+ pics = list()
+ index = 0
+ for m in mset:
+ index += 1
+ p = Pic()
+ #print m.group(1)
+ #print m.group(2)
+ # This oneline strips any paren content; disabled because it's just the
+ # set title repeated
+ #p.title = m.group(2).split(" (")[0]
+ p.title = "#" + str(index)
+ p.set = pset
+ p.caption = ""
+ p.original_url = m.group(1)
+ p.source_url = pset.source_url
+ pics.append(p)
+
+ # close the connection
+ html.close()
+
+ if(len(pics) < MIN_SET_SIZE):
+ print "Didn't find enough pictures to save this set (found " + \
+ str(len(pics)) + ", MIN_SET_SIZE=" + str(MIN_SET_SIZE) + ")"
+ pset.delete()
+ return
+
+ # TODO: oh boy, serial, this is a horrible way to do it!
+ for p in pics:
+ p.save()
+
+ # add a thumbnail image for the picset (TODO: resize thumbnail?)
+ if not pset.image:
+ pset.image = pics[0]
+ pset.save()
+
+ print "Found and saved " + str(len(pics)) + " new Pics"
+
+
+###############################################################################
+def scrape_feed(feed_shortname):
+
+ try:
+ feed = PicFeed.objects.get(shortname=feed_shortname)
+ except Exception as e:
+ sys.stderr.write("Error finding feed by shortname: " + \
+ feed_shortname + "\n")
+ raise e
+
+ print "Fetching feed for " + feed_shortname + ": " + feed.rssfeed_url
+ f = feedparser.parse(feed.rssfeed_url)
+ #print "DEBUG"
+ #f = feedparser.parse("./acidcow_com?format=xml")
+ #f = feedparser.parse("./ButDoesItFloat?format=xml")
+
+ psets = list()
+ for e in f['entries']:
+ pset = dict()
+ if(feed_shortname == "acidcow"):
+ pset['source_url'] = e.guid
+ elif(feed_shortname == "butdoesitfloat"):
+ pset['source_url'] = e.guid
+ else:
+ pset['source_url'] = e.link
+ pset['created'] = e.date
+ pset['title'] = e.title
+ pset['category'] = e.category
+ pset['description'] = e.description
+ psets.append(pset)
+
+ # clean up parser
+ del f
+
+ if(feed_shortname == u"acidcow"):
+ psets = filter(lambda s: s['category'] in (u'Pics', u'Picdump'), psets)
+
+ new_psets = filter(lambda s: \
+ 0 == len(PicSet.objects.filter(source_url=s['source_url'])), \
+ psets)
+
+ for pset in new_psets:
+ print "Parsing PicSet: " + pset['title'] + " (" + pset['category'] + ")"
+ p = PicSet()
+ p.feed = feed
+ p.source_url = pset['source_url']
+ p.created = datetime.strptime(pset['created'][:-6], \
+ "%a, %d %b %Y %H:%M:%S")
+ # This oneline strips any paren content
+ p.title = pset['title'].split(" (")[0]
+ if(feed_shortname == u"butdoesitfloat"):
+ # Category is a list for this site
+ p.category = Category.objects.get_or_create(name="Art")[0]
+ else:
+ p.category = Category.objects.get_or_create(name=pset['category'])[0]
+
+ # Ok, this is where we split out and do custom, per-site processing
+ if(feed_shortname == u"acidcow"):
+ p.save()
+ print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+ scrape_pics_acidcow(p)
+ elif(feed_shortname == u"butdoesitfloat"):
+ # the descriptions here are usually important
+ # TODO: should comment URL instead of link url (which bounces)?
+ p.description = pset['description'].split("<img")[0]
+ p.save()
+ print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+ scrape_pics_from_html(p, pset['description'])
+ else:
+ print "ERROR: unknown PicFeed: " + pset['feed']
+ #parse_page(s['source_url'], httplib.HTTPConnection)
+ break # DEBUG
+
+
+###############################################################################
+
+class Command(BaseCommand):
+ args = '<feed_shortname feed_shortname ...>'
+ help = 'Fetches RSS, parses, and possibly scrapes HTML for the given feeds/sources'
+
+ def handle(self, *args, **options):
+ if len(args) < 1:
+ sys.stderr.write("Need to specify at least one <feed_shortname>...\n")
+ return
+
+ for shortname in args:
+ try:
+ scrape_feed(shortname)
+ except Exception:
+ sys.stderr.write("Error scraping " + shortname + "\n")
+
+ sys.stdout.flush()
+ sys.stdout.write('Done scraping feeds.\n')
+