author    bnewbold <bnewbold@robocracy.org>  2011-05-11 22:49:11 -0400
committer bnewbold <bnewbold@robocracy.org>  2011-05-11 22:49:11 -0400
commit    9f4c07024e685235cb10e7f6e0d9a9090c771532 (patch)
tree      8af75034f0d7d22d7b761f2cf2b2e9ab741c323a /piccast/scrape_helpers.py
parent    5fbd142dddd6b364d6d22a1c027057aa5d9d9e6e (diff)
download  piccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.tar.gz
          piccast-9f4c07024e685235cb10e7f6e0d9a9090c771532.zip
WIP: new scraper stuff, should be a branch
Diffstat (limited to 'piccast/scrape_helpers.py')
-rw-r--r--  piccast/scrape_helpers.py  155
1 file changed, 155 insertions, 0 deletions
diff --git a/piccast/scrape_helpers.py b/piccast/scrape_helpers.py
new file mode 100644
index 0000000..b5d711b
--- /dev/null
+++ b/piccast/scrape_helpers.py
@@ -0,0 +1,155 @@
+import re
+import feedparser
+import urllib
+from datetime import datetime
+from BeautifulSoup import BeautifulSoup
+import json
+import sys
+
+# Django models for the feeds app (PicSet, Pic, etc. come from this import)
+from feeds.models import *
+
+# ---------------------------- Little Helpers -------------------------------
+def strip_parens(s):
+    """Return the portion of s before the first '(' character."""
+    return s.split("(")[0]
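+# Example (hypothetical input): strip_parens("Daily Picdump (89 pics)")
+# returns "Daily Picdump " (the trailing space is not stripped).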
+
+# ---------------------------- Fetchers -------------------------------
+def fetch_rss(uri):
+ """URI can actually be a file, stream, or string as well as a URL.
+ """
+ print "Fetching " + uri
+ ret = feedparser.parse(uri)
+    if ret.has_key("bozo_exception"):
+        raise Exception("Problem parsing RSS feed: uri=" + uri)
+ return ret
+
+def fetch_html(uri, raw=None):
+    """Parse HTML from uri, or from the raw string if one is given."""
+    if raw:
+ return BeautifulSoup(raw)
+ else:
+ print "Fetching " + uri
+ return BeautifulSoup(urllib.urlopen(uri))
+
+def fetch_json(uri):
+ print "Fetching " + uri
+ return json.load(urllib.urlopen(uri))
+
+def test_fetching(remotes=False):
+ # test fetching local rss
+ local_rss = fetch_rss("../example_feed_data/acidcow_rss.xml")
+ print str(local_rss.feed.description)
+ # test fetching local/inline html
+ local_html = fetch_html("../example_feed_data/acidcow_page.html")
+ print "Local has title: " + str(local_html.html.head.title)
+    inline_html = fetch_html("", raw="""
+ <html><head></head><body>
+ <div id="some_div"><img src="123.png"></div>
+ </body></html>""")
+ print inline_html.first("img")["src"]
+ # test fetching local/remote json
+ local_json = fetch_json("../example_feed_data/piccast_feeds.json")
+ print "Local json: " + local_json["list"][0]["source_url"]
+ if remotes:
+ remote_json = fetch_json("http://piccastapp.com/json/v0/feeds/")
+ print "Remote json: " + remote_json["list"][1]["source_url"]
+
+# ---------------------------- Data to PicSet -------------------------------
+def sets_from_rss(data,
+ source_url_field = "link",
+ created_field = "date",
+ created_format = ("%a, %d %b %Y %H:%M:%S", -6),
+ title_field = "title",
+ category = None,
+ category_field = "category",
+ description_field = "description",
+ data_field = None,
+ ):
+ """
+ This function takes an RSS feedparser object and returns a list of PicSets.
+ Feed-dependant parameters can be used to select specific RSS elements to
+ look in for various fields. The feed content/description can be saved to
+ the data parameter for each set, which allows this HTML to get pulled out
+ for processing later in the pipeline.
+
+ A base assumption is that each RSS entry corresponds to a PicSet.
+ """
+    if not isinstance(data, feedparser.FeedParserDict):
+ raise Exception("'data' must be a feedparser object")
+
+ sets = []
+ for entry in data['entries']:
+ s = PicSet()
+ try:
+ s.source_url = entry[source_url_field]
+            # created_format[1] is a negative slice index that trims the
+            # trailing timezone from the date string, e.g.
+            # "Wed, 11 May 2011 22:49:11 -0400"[:-6] -> "Wed, 11 May 2011 22:49:11"
+            s.created = datetime.strptime(
+                entry[created_field][:created_format[1]], created_format[0])
+ s.title = entry[title_field]
+ if category:
+ s.category = category
+ else:
+ s.category_name = entry[category_field]
+ s.description = entry[description_field]
+ if data_field:
+ s.data = entry[data_field]
+ except KeyError as ke:
+ sys.stderr.write("Missing field while parsing RSS into PicSet: " + str(ke) + " (continuing...)\n")
+ continue
+ sets.append(s)
+ return sets
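+# Example usage (a sketch, not from this codebase: the feed URL is a
+# placeholder, and Category.objects.get assumes a Category model exists
+# in feeds.models):
+#
+#   data = fetch_rss("http://example.com/rss")
+#   sets = sets_from_rss(data,
+#       category = Category.objects.get(name = "Humor"),
+#       created_format = ("%a, %d %b %Y %H:%M:%S", -6))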
+
+# sets_from_html(data, params): TODO
+# sets_from_json(data, params): TODO
+
+# ---------------------------- Data to Pics -------------------------------
+
+def pics_from_html_simple(data,
+ find_images = lambda d: d.findAll("img"),
+ title_base = "Untitled Pic",
+ source_url = None,
+ match_src = None,
+ meaningless_titles = [],
+ ):
+ """
+    This function simply looks for <img> tags (using find_images), creates a
+    Pic for each, and returns a list of these.
+ 'data' should be a BeautifulSoup HTML parsing object; use fetch_html().
+ """
+ pics = []
+ index = 1
+ for i in find_images(data):
+ p = Pic()
+        # skip images with no src attribute, or whose src doesn't match
+        if not i.has_key("src"):
+            continue
+        if match_src and match_src not in i["src"]:
+            continue
+ p.original_url = i["src"]
+ if i.has_key("width"):
+ p.width = i["width"]
+ if i.has_key("height"):
+ p.height = i["height"]
+ if source_url:
+ p.source_url = source_url
+
+ if i.has_key("title"):
+ p.title = i["title"]
+ elif i.has_key("alt"):
+ p.title = i["alt"]
+ else:
+ p.title = title_base + " #" + str(index)
+        if p.title in ["", " "] + list(meaningless_titles):
+ p.title = title_base + " #" + str(index)
+ pics.append(p)
+ index += 1
+ return pics
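+# Example usage (a sketch; the div id, the match_src value, and the pset
+# variable are assumptions about the calling code):
+#
+#   page = fetch_html(pset.source_url)
+#   pics = pics_from_html_simple(page,
+#       find_images = lambda d: d.find("div", id = "content").findAll("img"),
+#       title_base = pset.title,
+#       source_url = pset.source_url,
+#       match_src = "/pics/")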
+
+# pics_from_html(data, params):
+# TODO, in a more general/careful fashion. eg, scrape captions
+# pics_from_rss(data, params): TODO
+# pics_from_json(data, params): TODO
+
+# ---------------------------- Generics -------------------------------
+# generic_blogger(PicFeed)
+# generic_flickr(PicFeed)
+# def generic_single_rss(pf):
+
+