author    bnewbold <bnewbold@robocracy.org>  2011-04-22 02:02:09 -0400
committer bnewbold <bnewbold@robocracy.org>  2011-04-22 02:02:09 -0400
commit    6413abbd68e6a441fe0df4fe93e9e78e375636dc (patch)
tree      4baa22d55982aa170698690da809e7ef9480e2a8 /piccast/feeds
parent    08d8e7ebf5845ea8e7dd2e0eaaf61b3f9a8ea1ce (diff)
download  piccast-6413abbd68e6a441fe0df4fe93e9e78e375636dc.tar.gz
          piccast-6413abbd68e6a441fe0df4fe93e9e78e375636dc.zip

crude feed scraper
Diffstat (limited to 'piccast/feeds')
-rw-r--r--  piccast/feeds/management/__init__.py               |   0
-rw-r--r--  piccast/feeds/management/commands/__init__.py      |   0
-rw-r--r--  piccast/feeds/management/commands/scrape_feeds.py  | 205
-rw-r--r--  piccast/feeds/views.py                             |   3
4 files changed, 207 insertions(+), 1 deletion(-)
diff --git a/piccast/feeds/management/__init__.py b/piccast/feeds/management/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/piccast/feeds/management/__init__.py
diff --git a/piccast/feeds/management/commands/__init__.py b/piccast/feeds/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/piccast/feeds/management/commands/__init__.py
diff --git a/piccast/feeds/management/commands/scrape_feeds.py b/piccast/feeds/management/commands/scrape_feeds.py
new file mode 100644
index 0000000..1de9e4c
--- /dev/null
+++ b/piccast/feeds/management/commands/scrape_feeds.py
@@ -0,0 +1,205 @@
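+# Crude feed scraper: a management command that pulls each PicFeed's RSS,
+# creates PicSet rows for entries it hasn't seen before, and scrapes the
+# linked pages for individual Pics.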
+import re
+import feedparser
+import urllib
+import sys
+from datetime import datetime
+
+from django.core.management.base import BaseCommand, CommandError
+from feeds.models import Category, Pic, PicFeed, PicSet
+
+# see Command definition at the end
+
+# this mechanism isn't robust yet: sets smaller than MIN_SET_SIZE get deleted
+# and re-parsed on every run
+MIN_SET_SIZE = 1 # Need to find at least this many images for each set
+
+###############################################################################
+def scrape_pics_from_html(pset, html):
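+  # Scrape <img> tags out of a raw HTML blob, build a Pic ("#1", "#2", ...)
+  # for each match, and attach them to the given PicSet; the set is deleted
+  # again if fewer than MIN_SET_SIZE images are found.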
+
+  if not isinstance(pset, PicSet):
+    raise TypeError("expected PicSet")
+
+ # assumptions: one <img> per line, ordering, etc
+ mset = re.finditer('<img src="(http://\S+)" \S* width="(\d+)" height="(\d+)"', html)
+
+ pics = list()
+ index = 0
+ for m in mset:
+ index += 1
+ p = Pic()
+ #print m.group(1)
+ #print m.group(2)
+ # This one line strips any paren content; disabled because it's just the
+ # set title repeated
+ #p.title = m.group(2).split(" (")[0]
+ p.title = "#" + str(index)
+ p.set = pset
+ # empty string isn't the same as leaving this null
+ #p.caption = ""
+ p.original_url = m.group(1)
+ p.source_url = pset.source_url
+ p.width = m.group(2)
+ p.height = m.group(3)
+ pics.append(p)
+
+ if(len(pics) < MIN_SET_SIZE):
+ print "Didn't find enough pictures to save this set (found " + \
+ str(len(pics)) + ", MIN_SET_SIZE=" + str(MIN_SET_SIZE) + ")"
+ pset.delete()
+ return
+
+ # TODO: oh boy, serial, this is a horrible way to do it!
+ for p in pics:
+ p.save()
+
+ # add a thumbnail image for the picset (TODO: resize thumbnail?)
+ if not pset.image:
+ pset.image = pics[0]
+ pset.save()
+
+ print "Found and saved " + str(len(pics)) + " new Pics"
+
+###############################################################################
+def scrape_pics_acidcow(pset):
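+  # Fetch pset.source_url and scrape the full-size image URLs out of the
+  # acidcow.com gallery page with a site-specific regex; otherwise this
+  # mirrors scrape_pics_from_html() above.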
+
+  if not isinstance(pset, PicSet):
+    raise TypeError("expected PicSet")
+
+ #print "DEBUG"
+ #html = open("./19340-evolution-16-pics.html")
+ print "Scraping from " + pset.source_url + "..."
+ html = urllib.urlopen(pset.source_url)
+
+ mset = re.finditer('src="(http://acidcow.com/pics/\d+/\S+)" alt=\'([^\.]+)\' title', html.read())
+
+ pics = list()
+ index = 0
+ for m in mset:
+ index += 1
+ p = Pic()
+ #print m.group(1)
+ #print m.group(2)
+    # This one-liner strips any paren content; disabled because it's just the
+ # set title repeated
+ #p.title = m.group(2).split(" (")[0]
+ p.title = "#" + str(index)
+ p.set = pset
+ p.caption = ""
+ p.original_url = m.group(1)
+ p.source_url = pset.source_url
+ pics.append(p)
+
+ # close the connection
+ html.close()
+
+ if(len(pics) < MIN_SET_SIZE):
+ print "Didn't find enough pictures to save this set (found " + \
+ str(len(pics)) + ", MIN_SET_SIZE=" + str(MIN_SET_SIZE) + ")"
+ pset.delete()
+ return
+
+ # TODO: oh boy, serial, this is a horrible way to do it!
+ for p in pics:
+ p.save()
+
+ # add a thumbnail image for the picset (TODO: resize thumbnail?)
+ if not pset.image:
+ pset.image = pics[0]
+ pset.save()
+
+ print "Found and saved " + str(len(pics)) + " new Pics"
+
+
+###############################################################################
+def scrape_feed(feed_shortname):
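+  # Look up the PicFeed by shortname, parse its RSS with feedparser, skip
+  # entries whose source_url already has a PicSet, and hand each new set to
+  # the per-site scraper above.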
+
+ try:
+ feed = PicFeed.objects.get(shortname=feed_shortname)
+ except Exception as e:
+ sys.stderr.write("Error finding feed by shortname: " + \
+ feed_shortname + "\n")
+ raise e
+
+ print "Fetching feed for " + feed_shortname + ": " + feed.rssfeed_url
+ f = feedparser.parse(feed.rssfeed_url)
+ #print "DEBUG"
+ #f = feedparser.parse("./acidcow_com?format=xml")
+ #f = feedparser.parse("./ButDoesItFloat?format=xml")
+
+ psets = list()
+ for e in f['entries']:
+ pset = dict()
+ if(feed_shortname == "acidcow"):
+ pset['source_url'] = e.guid
+ elif(feed_shortname == "butdoesitfloat"):
+ pset['source_url'] = e.guid
+ else:
+ pset['source_url'] = e.link
+ pset['created'] = e.date
+ pset['title'] = e.title
+ pset['category'] = e.category
+ pset['description'] = e.description
+ psets.append(pset)
+
+ # clean up parser
+ del f
+
+ if(feed_shortname == u"acidcow"):
+ psets = filter(lambda s: s['category'] in (u'Pics', u'Picdump'), psets)
+
+ new_psets = filter(lambda s: \
+ 0 == len(PicSet.objects.filter(source_url=s['source_url'])), \
+ psets)
+
+ for pset in new_psets:
+ print "Parsing PicSet: " + pset['title'] + " (" + pset['category'] + ")"
+ p = PicSet()
+ p.feed = feed
+ p.source_url = pset['source_url']
+ p.created = datetime.strptime(pset['created'][:-6], \
+ "%a, %d %b %Y %H:%M:%S")
+    # This one-liner strips any paren content
+ p.title = pset['title'].split(" (")[0]
+ if(feed_shortname == u"butdoesitfloat"):
+ # Category is a list for this site
+ p.category = Category.objects.get_or_create(name="Art")[0]
+ else:
+ p.category = Category.objects.get_or_create(name=pset['category'])[0]
+
+ # Ok, this is where we split out and do custom, per-site processing
+ if(feed_shortname == u"acidcow"):
+ p.save()
+ print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+ scrape_pics_acidcow(p)
+ elif(feed_shortname == u"butdoesitfloat"):
+ # the descriptions here are usually important
+      # TODO: should this use the comments URL instead of the link URL (which bounces)?
+ p.description = pset['description'].split("<img")[0]
+ p.save()
+ print "Great, saved: " + p.title + " (id=" + str(p.id) + ")"
+ scrape_pics_from_html(p, pset['description'])
+ else:
+      print "ERROR: unknown PicFeed: " + feed_shortname
+ #parse_page(s['source_url'], httplib.HTTPConnection)
+ break # DEBUG
+
+
+###############################################################################
+
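+# Invoked as a Django management command, e.g.:
+#   python manage.py scrape_feeds acidcow butdoesitfloat
+# (the arguments must match PicFeed.shortname values in the database)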
+class Command(BaseCommand):
+ args = '<feed_shortname feed_shortname ...>'
+ help = 'Fetches RSS, parses, and possibly scrapes HTML for the given feeds/sources'
+
+ def handle(self, *args, **options):
+    if len(args) < 1:
+      raise CommandError("Need to specify at least one <feed_shortname>")
+
+ for shortname in args:
+ try:
+ scrape_feed(shortname)
+      except Exception as e:
+        sys.stderr.write("Error scraping " + shortname + ": " + str(e) + "\n")
+
+ sys.stdout.flush()
+ sys.stdout.write('Done scraping feeds.\n')
+
diff --git a/piccast/feeds/views.py b/piccast/feeds/views.py
index de335d6..daa4a28 100644
--- a/piccast/feeds/views.py
+++ b/piccast/feeds/views.py
@@ -34,7 +34,6 @@ def sets_by_feedslug(request, req_feedslug):
json_serializer.serialize(sets)
return HttpResponse(json_serializer.getvalue(), mimetype="application/json")
-
def pics_by_set(request, req_picset):
# TODO: validate req_picset
pics = Pic.objects.filter(set=req_picset)
@@ -42,3 +41,5 @@ def pics_by_set(request, req_picset):
json_serializer.serialize(pics)
return HttpResponse(json_serializer.getvalue(), mimetype="application/json")
+def
+