WIP: new scraper stuff, should be a branch

author: bnewbold <bnewbold@robocracy.org> 2011-05-11 22:49:11 -0400
committer: bnewbold <bnewbold@robocracy.org> 2011-05-11 22:49:11 -0400
commit: 9f4c07024e685235cb10e7f6e0d9a9090c771532 (patch)
tree: 8af75034f0d7d22d7b761f2cf2b2e9ab741c323a
parent: 5fbd142dddd6b364d6d22a1c027057aa5d9d9e6e (diff)
download: piccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.tar.gz
piccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.zip
3 files changed, 388 insertions, 0 deletions
diff --git a/piccast/scrape_feeds_new.py b/piccast/scrape_feeds_new.py
new file mode 100644
index 0000000..a28cff2
--- /dev/null
+++ b/piccast/scrape_feeds_new.py
@@ -0,0 +1,7 @@
+
+from scrapers import *
+
+
+# save_set()
+
+
diff --git a/piccast/scrape_helpers.py b/piccast/scrape_helpers.py
new file mode 100644
index 0000000..b5d711b
--- /dev/null
+++ b/piccast/scrape_helpers.py
@@ -0,0 +1,155 @@
+import re
+import feedparser
+import urllib
+from datetime import *
+from BeautifulSoup import BeautifulSoup
+import json
+import sys
+
+# this is a django thing
+from feeds.models import *
+
+# ---------------------------- Little Helpers -------------------------------
+def strip_parens(s):
+    return s.partition("(")[0]  # text before the first "(" (all of s if none)
+
+# ---------------------------- Fetchers -------------------------------
+def fetch_rss(uri):
+    """Parse 'uri' with feedparser; it may be a URL, file, stream, or string.
+    Raises Exception if the result carries a "bozo_exception" entry."""
+    print "Fetching %s" % (uri,)  # %-format: 'uri' is not always a string
+    ret = feedparser.parse(uri)
+    if ret.has_key("bozo_exception"):
+        raise Exception("problem parsing in RSS? uri=%s" % (uri,))
+    return ret
+
+def fetch_html(uri, raw=None):
+    """Parse 'raw' if it is given (even when empty); else fetch and parse 'uri'."""
+    if raw is not None:  # an empty string is still inline markup, not "absent"
+        return BeautifulSoup(raw)
+    print "Fetching " + uri
+    return BeautifulSoup(urllib.urlopen(uri))
+
+def fetch_json(uri):
+    print "Fetching " + uri  # progress trace; 'uri' is opened with urllib.urlopen
+    return json.loads(urllib.urlopen(uri).read())
+
+def test_fetching(remotes=False):
+    """Smoke-test the fetch_* helpers on local fixtures (remote JSON if 'remotes')."""
+    rss_doc = fetch_rss("../example_feed_data/acidcow_rss.xml")
+    print str(rss_doc.feed.description)
+    # HTML: once from a local file, once from an inline string
+    page_doc = fetch_html("../example_feed_data/acidcow_page.html")
+    print "Local has title: " + str(page_doc.html.head.title)
+    snippet_doc = fetch_html("", raw="""
+        <html><head></head><body>
+        <div id="some_div"><img src="123.png"></div>
+        </body></html>""")
+    print snippet_doc.first("img")["src"]
+    # JSON: the local fixture always, the live endpoint only on request
+    json_doc = fetch_json("../example_feed_data/piccast_feeds.json")
+    print "Local json: " + json_doc["list"][0]["source_url"]
+    if remotes:
+        live_doc = fetch_json("http://piccastapp.com/json/v0/feeds/")
+        print "Remote json: " + live_doc["list"][1]["source_url"]
+
+# ---------------------------- Data to PicSet -------------------------------
+def sets_from_rss(data,
+        source_url_field = "link",
+        created_field = "date",
+        created_format = ("%a, %d %b %Y %H:%M:%S", -6),
+        title_field = "title",
+        category = None,
+        category_field = "category",
+        description_field = "description",
+        data_field = None,
+        ):
+    """
+    Build one unsaved PicSet per RSS entry and return them as a list.
+
+    'data' must be a feedparser result (see fetch_rss()) or Exception is
+    raised. Each *_field argument names the entry key to read. The date text
+    is sliced to [:created_format[1]] (-6 drops the last six characters) and
+    parsed with strptime format created_format[0]. A given 'category' goes on
+    every set; otherwise the entry's category text is kept as 'category_name'.
+    If 'data_field' is set, that entry value is stored as the set's 'data'.
+    Entries missing a requested key are reported on stderr and skipped.
+    """
+    if not isinstance(data, feedparser.FeedParserDict):
+        raise Exception("'data' must be a feedparser object")
+
+    sets = []
+    for entry in data['entries']:
+        s = PicSet()
+        try:
+            s.source_url = entry[source_url_field]
+            date_text = entry[created_field][:created_format[1]]
+            s.created = datetime.strptime(date_text, created_format[0])
+            s.title = entry[title_field]
+            if category:
+                s.category = category
+            else:
+                s.category_name = entry[category_field]
+            s.description = entry[description_field]
+            if data_field:
+                s.data = entry[data_field]
+        except KeyError as ke:
+            sys.stderr.write("Missing field while parsing RSS into PicSet: " + str(ke) + " (continuing...)\n")
+            continue
+        sets.append(s)
+    return sets
+
+# sets_from_html(data, params): TODO
+# sets_from_json(data, params): TODO
+
+# ---------------------------- Data to Pics -------------------------------
+
+def pics_from_html_simple(data,
+        find_images = lambda d: d.findAll("img"),
+        title_base = "Untitled Pic",
+        source_url = None,
+        match_src = None,
+        meaningless_titles = [],
+        ):
+    """
+    Create an unsaved Pic for each <img> tag found in 'data'; return the list.
+    'data' should be a BeautifulSoup HTML parsing object; use fetch_html().
+    Tags with no src, or whose src lacks 'match_src' (when given), are skipped.
+    Titles come from the title, then alt attribute; a missing, blank, or
+    'meaningless_titles' value becomes "<title_base> #<n>" (n counts kept pics).
+    """
+    pics = []
+    index = 1
+    for i in find_images(data):
+        src = i.get("src")
+        if src is None:
+            continue    # no src: nothing to link to (indexing would raise KeyError)
+        if match_src and match_src not in src:
+            continue
+        p = Pic()
+        p.original_url = src
+        if source_url:
+            p.source_url = source_url
+        for dimension in ("width", "height"):
+            if i.has_key(dimension):
+                setattr(p, dimension, i[dimension])
+
+        fallback_title = title_base + " #" + str(index)
+        p.title = i.get("title", i.get("alt", fallback_title))
+        if p.title in list(("", " ",)) + list(meaningless_titles):
+            p.title = fallback_title
+        pics.append(p)
+        index += 1
+    return pics
+
+# pics_from_html(data, params): 
+#   TODO, in a more general/careful fashion. eg, scrape captions
+# pics_from_rss(data, params): TODO
+# pics_from_json(data, params): TODO
+
+# ---------------------------- Generics -------------------------------
+# generic_blogger(PicFeed)
+# generic_flickr(PicFeed)
+# def generic_single_rss(pf):
+
+
diff --git a/piccast/scrapers.py b/piccast/scrapers.py
new file mode 100644
index 0000000..933c54c
--- /dev/null
+++ b/piccast/scrapers.py
@@ -0,0 +1,226 @@
+
+# Pulls in helpers as well as utility libraries
+from scrape_helpers import *
+
+# Pulls in the PicCast model objects (PicFeed, PicSet, and Pic)
+from feeds.models import *
+
+from django.utils.html import strip_tags
+
+"""
+This file contains a set of scraper functions, one for each PicCast feed.
+Helper functions are defined in scrape_helpers.py; when run as a django command
+this file is included from scrape_feeds.py. 
+
+The general pattern for each scraper is shown below. Note that neither pics nor
+sets are saved by this function::
+
+    def scrape_examplefeed(PicFeed):
+        sets_data = fetch_data(PicFeed.uri)
+        sets = sets_from_data(sets_data)
+        filter(sets)
+        foreach(sets):
+            set = modifications(set)
+            pic_data = fetch_data(set)
+            pics = pics_from_data(pic_data)
+            filter(pics)
+            set.pics = pics
+            good_sets.append(set)
+        return (good_sets)
+
+It is difficult to generalize too much further because there are several source
+data types for both feeds of PicSets and the list of pictures themselves. For
+example, some of the known patterns which need to be accommodated are: 
+
+    rss -> sets, foreach inline html -> pics
+    rss -> sets, foreach html -> pics
+    json -> sets, foreach rss -> pics
+    json -> sets, foreach json -> pics
+    html -> sets, foreach html -> pics
+
+To add a new scraper, first add the PicFeed to the local database, setting
+is_active to True. Then create a scrape_<shortname>() method (copy one of the
+below) and modify the parameters and dataflow to suit that PicFeed. Test by
+running a django shell ("./manage.py shell"), import the scrapers ("run
+scrapers.py"), and then run a test for that one feed
+("test_scrapers(name='shortname')"). You'll need to download some test
+RSS/HTML/whatever to test with and put it in ../example_feed_data/. 
+
+"""
+
+def save_sets(sets):  # persist every PicSet in 'sets' along with its pics
+    print "Saving " + str(len(sets)) + " new PicSets..."
+    for picset in sets:
+        print " - " + str(picset) + " with " + str(len(picset.pics)) + " pics"
+        picset.save()  # the set is saved first, then each pic is pointed at it
+        for pic in picset.pics:
+            pic.set = picset
+            pic.save()
+
+# ---------------------- Real scrapers go here  ---------------------
+def scrape_acidcow(pf, testing = False):
+    """Scrape acidcow: RSS entries become sets, the page at each set's
+    source_url supplies its pics. With 'testing' the local example files are
+    read and the sets returned unsaved; otherwise they go to save_sets()."""
+    wanted_categories = ("Pics", "Picdump", "Celebs", "Girls", "Cars")
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/acidcow_rss.xml"
+        testing_uri = "../example_feed_data/acidcow_page.html"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    candidates = sets_from_rss(sets_data, source_url_field = "guid")
+    known_urls = [old.source_url for old in pf.picset_set.all()]
+    candidates = [c for c in candidates if c.source_url not in known_urls]
+    candidates = [c for c in candidates if c.category_name in wanted_categories]
+    good_sets = []
+    for picset in candidates:
+        if testing:
+            picset.source_url = testing_uri
+        if len(picset.description) > 0:
+            picset.description = strip_tags(picset.description)
+            if picset.description.startswith("Similar posts:"):
+                picset.description = None
+        full_title = picset.title
+        picset.title = strip_parens(full_title)
+        picset.category = Category.objects.get_or_create(
+            name=picset.category_name)[0]
+        print picset
+        pics = pics_from_html_simple(fetch_html(picset.source_url),
+            match_src = "http://acidcow.com/pics/",
+            source_url = picset.source_url,
+            meaningless_titles = [full_title, ],
+            )
+        # sets with fewer than three matching pics are dropped
+        if len(pics) < 3:
+            continue
+        picset.pics = pics
+        good_sets.append(picset)
+    if testing:
+        return good_sets
+    save_sets(good_sets)
+
+def scrape_butdoesitfloat(pf, testing = False):
+    """Scrape butdoesitfloat: pics come from the HTML in each RSS description.
+    Sets with fewer than three pics are dropped. 'testing' reads the local
+    example feed and returns the sets unsaved instead of saving them."""
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/butdoesitfloat_rss.xml"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    art_category = Category.objects.get_or_create(name="Art")[0]
+    candidates = sets_from_rss(sets_data,
+        source_url_field = "comments", category = art_category)
+    known_urls = [old.source_url for old in pf.picset_set.all()]
+    candidates = [c for c in candidates if c.source_url not in known_urls]
+    good_sets = []
+    for picset in candidates:
+        pic_data = fetch_html("", raw=picset.description)
+        if len(picset.description) > 0:
+            # keep only the text ahead of the first image tag
+            picset.description = picset.description.split("<img")[0]
+        pics = pics_from_html_simple(pic_data)
+        if len(pics) >= 3:
+            picset.pics = pics
+            good_sets.append(picset)
+    if testing:
+        return good_sets
+    save_sets(good_sets)
+
+def scrape_nzinzi(pf, testing = False):
+    """Scrape nzinzi: pics are the ntamak.free.fr images in each RSS description.
+    Descriptions are blanked once parsed; sets with under three pics are dropped.
+    'testing' reads the local example feed and returns the sets unsaved."""
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/nzinzi_rss.xml"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    art_category = Category.objects.get_or_create(name="Art")[0]
+    candidates = sets_from_rss(sets_data,
+        category = art_category,
+        created_format = ("%Y-%m-%dT%H:%M:%S", -10),
+        )
+    known_urls = [old.source_url for old in pf.picset_set.all()]
+    candidates = [c for c in candidates if c.source_url not in known_urls]
+    good_sets = []
+    for picset in candidates:
+        embedded_html = fetch_html("", raw=picset.description)
+        picset.description = ""
+        pics = pics_from_html_simple(embedded_html,
+            match_src = "http://ntamak.free.fr/")
+        if len(pics) >= 3:
+            picset.pics = pics
+            good_sets.append(picset)
+    if testing:
+        return good_sets
+    save_sets(good_sets)
+
+def scrape_vectortut(pf, testing = False):
+    """Scrape vectortut: keep only "Inspirational" entries and take the
+    cloudfront-hosted images from each entry's RSS description as its pics.
+    Sets with under three pics are dropped. 'testing' reads the local
+    example feed and returns the sets unsaved instead of saving them."""
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/vectortut_rss.xml"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    candidates = sets_from_rss(sets_data)
+    known_urls = [old.source_url for old in pf.picset_set.all()]
+    candidates = [c for c in candidates if c.source_url not in known_urls]
+    candidates = [c for c in candidates
+        if c.category_name in ("Inspirational",)]
+    good_sets = []
+    for picset in candidates:
+        embedded_html = fetch_html("", raw=picset.description)
+        picset.description = ""
+        picset.category = Category.objects.get_or_create(
+            name=picset.category_name)[0]
+        pics = pics_from_html_simple(embedded_html,
+            match_src = ".cloudfront.net/")
+        if len(pics) >= 3:
+            picset.pics = pics
+            good_sets.append(picset)
+    if testing:
+        return good_sets
+    save_sets(good_sets)
+
+def scrape_chubbychinese(pf, testing = False):
+    """Scrape chubbychinese: each RSS entry becomes a "Food" set whose pics are
+    the static.flickr.com images in its description; sets with fewer than four
+    pics are dropped. 'testing' reads the local feed and returns sets unsaved."""
+    if testing:
+        pf.rssfeed_url = "../example_feed_data/chubbychinese_rss.xml"
+    sets_data = fetch_rss(pf.rssfeed_url)
+    sets = sets_from_rss(sets_data,
+            category = Category.objects.get_or_create(name="Food")[0],
+        )
+    # No category_name filter: sets_from_rss() assigns category_name only when
+    # no fixed 'category' is passed, so there is nothing to filter on here.
+    existing_urls = set(s.source_url for s in pf.picset_set.all())
+    sets = [s for s in sets if s.source_url not in existing_urls]
+    good_sets = []
+    for s in sets:
+        pic_data = fetch_html("", raw=s.description)
+        pics = pics_from_html_simple(pic_data,
+            match_src = "static.flickr.com/")
+        if len(pics) < 4:
+            continue
+        s.pics = pics
+        good_sets.append(s)
+    if testing:
+        return good_sets
+    save_sets(good_sets)
+
+# ---------------------- Testing routines, not required ---------------------
+def test_scrapers(name=None):
+    """Run the scraper of each active PicFeed (or only 'name') in testing mode."""
+    for pf in PicFeed.objects.filter(is_active=True):
+        if name and pf.shortname != name:
+            continue
+        print "Testing " + pf.shortname + " scrapper ============================="
+        scrape = globals().get("scrape_" + pf.shortname)
+        if scrape is None:
+            print "FAILED, no scrape_" + pf.shortname + " found in globals()"
+            continue
+        for s in scrape(pf, testing=True)[:1]:  # first set only; none -> no output
+            print s.source_url
+            print s.title
+            print s.category
+            print s.pics[0].original_url
+            print s.pics[0].title
+
author	bnewbold <bnewbold@robocracy.org>	2011-05-11 22:49:11 -0400
committer	bnewbold <bnewbold@robocracy.org>	2011-05-11 22:49:11 -0400
commit	9f4c07024e685235cb10e7f6e0d9a9090c771532 (patch)
tree	8af75034f0d7d22d7b761f2cf2b2e9ab741c323a
parent	5fbd142dddd6b364d6d22a1c027057aa5d9d9e6e (diff)
download	piccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.tar.gz piccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.zip