import re
import feedparser
import urllib
from datetime import datetime
from BeautifulSoup import BeautifulSoup
import json
import sys

# Django models: PicSet and Pic come from the feeds app
from feeds.models import *

# ---------------------------- Little Helpers -------------------------------
def strip_parens(s):
    """Return everything before the first '(' in s."""
    return s.split("(")[0]
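
# eg, strip_parens("Daily Picdump (35 pics)") returns "Daily Picdump "
# (note the trailing space survives; callers can .strip() if they care)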

# ---------------------------- Fetchers -------------------------------
def fetch_rss(uri):
    """URI can actually be a file, stream, or string as well as a URL.
    """
    print "Fetching " + uri
    ret = feedparser.parse(uri)
    if ret.has_key("bozo_exception"):
        raise Exception("problem parsing RSS; uri=" + uri)
    return ret
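
# eg, fetch_rss("http://acidcow.com/rss.xml").entries[0]["title"]
# (URL hypothetical; see test_fetching() below for a local-file example)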

def fetch_html(uri, raw=None):
    """Parse HTML from uri, or from the 'raw' string if one is given."""
    if raw:
        return BeautifulSoup(raw)
    else:
        print "Fetching " + uri
        return BeautifulSoup(urllib.urlopen(uri))

def fetch_json(uri):
    print "Fetching " + uri
    return json.load(urllib.urlopen(uri))

def test_fetching(remotes=False):
    # test fetching local rss
    local_rss = fetch_rss("../example_feed_data/acidcow_rss.xml")
    print str(local_rss.feed.description)
    # test fetching local/inline html
    local_html = fetch_html("../example_feed_data/acidcow_page.html")
    print "Local has title: " + str(local_html.html.head.title)
    inline_html = fetch_html("", raw="""
        <html><head></head><body>
        <div id="some_div"><img src="123.png"></div>
        </body></html>""")
    print inline_html.first("img")["src"]
    # test fetching local/remote json
    local_json = fetch_json("../example_feed_data/piccast_feeds.json")
    print "Local json: " + local_json["list"][0]["source_url"]
    if remotes:
        remote_json = fetch_json("http://piccastapp.com/json/v0/feeds/")
        print "Remote json: " + remote_json["list"][1]["source_url"]

# ---------------------------- Data to PicSet -------------------------------
def sets_from_rss(data,
        source_url_field="link",
        created_field="date",
        created_format=("%a, %d %b %Y %H:%M:%S", -6),
        title_field="title",
        category=None,
        category_field="category",
        description_field="description",
        data_field=None,
        ):
    """
    This function takes an RSS feedparser object and returns a list of PicSets.
    Feed-dependant parameters can be used to select specific RSS elements to
    look in for various fields. The feed content/description can be saved to
    the data parameter for each set, which allows this HTML to get pulled out
    for processing later in the pipeline. 
    
    A base assumption is that each RSS entry corresponds to a PicSet. 
    """
    if not isinstance(data, feedparser.FeedParserDict):
        raise Exception("'data' must be a feedparser object")
    
    sets = []
    for entry in data['entries']:
        s = PicSet()
        try:
            s.source_url = entry[source_url_field]
            # created_format[1] is a slice bound; eg, -6 trims a trailing
            # timezone ("Sun, 19 Sep 2010 18:19:17 -0600" becomes
            # "Sun, 19 Sep 2010 18:19:17") so strptime can parse it
            s.created = datetime.strptime(
                entry[created_field][:created_format[1]],
                created_format[0])
            s.title = entry[title_field]
            if category:
                s.category = category
            else:
                s.category_name = entry[category_field]
            s.description = entry[description_field]
            if data_field:
                s.data = entry[data_field]
        except KeyError as ke:
            sys.stderr.write("Missing field while parsing RSS into PicSet: " + str(ke) + " (continuing...)\n")
            continue
        sets.append(s)
    return sets
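
# Example usage (a sketch; the URL, category object, and field choices
# here are hypothetical and depend on the actual feed):
#
#   data = fetch_rss("http://example.com/rss")
#   sets = sets_from_rss(data,
#           category=some_category,      # a feeds.models category, if known
#           data_field="description")    # stash entry HTML in s.data for later
#   for s in sets:
#       print s.title + " <- " + s.source_url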

# sets_from_html(data, params): TODO
# sets_from_json(data, params): TODO

# ---------------------------- Data to Pics -------------------------------

def pics_from_html_simple(data,
        find_images=lambda d: d.findAll("img"),
        title_base="Untitled Pic",
        source_url=None,
        match_src=None,
        meaningless_titles=[],
        ):
    """
    This function simply looks for well <img> tags, creates a Pic for each and returns
    a list of these. 
    'data' should be a BeautifulSoup HTML parsing object; use fetch_html().
    """
    pics = []
    index = 1
    for i in find_images(data):
        p = Pic()
        if not i.has_key("src"):
            continue
        if match_src and match_src not in i["src"]:
            continue
        p.original_url = i["src"]
        if i.has_key("width"):
            p.width = i["width"]
        if i.has_key("height"):
            p.height = i["height"]
        if source_url:
            p.source_url = source_url

        if i.has_key("title"):
            p.title = i["title"]
        elif i.has_key("alt"):
            p.title = i["alt"]
        else:
            p.title = title_base + " #" + str(index)
        if p.title in ["", " "] + list(meaningless_titles):
            p.title = title_base + " #" + str(index)
        pics.append(p)
        index += 1
    return pics
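
# Example usage (a sketch; the URL and match_src value are hypothetical):
#
#   page = fetch_html("http://example.com/gallery/123")
#   pics = pics_from_html_simple(page,
#           match_src="/pics/",          # keep only images under /pics/
#           title_base="Gallery Pic",
#           meaningless_titles=["image"])
#   for p in pics:
#       print p.title + ": " + p.original_url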

# pics_from_html(data, params): 
#   TODO, in a more general/careful fashion. eg, scrape captions
# pics_from_rss(data, params): TODO
# pics_from_json(data, params): TODO

# ---------------------------- Generics -------------------------------
# generic_blogger(PicFeed)
# generic_flickr(PicFeed)
# def generic_single_rss(pf):
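
# A minimal sketch of generic_single_rss, left commented out like the other
# stubs; the PicFeed field name ('uri') and the PicSet 'feed'/'pics'
# attributes below are assumptions, not confirmed by feeds.models:
#
# def generic_single_rss(pf):
#     data = fetch_rss(pf.uri)      # assumes PicFeed stores its RSS URL here
#     sets = sets_from_rss(data, data_field="description")
#     for s in sets:
#         s.feed = pf               # assumed PicSet -> PicFeed relation
#         # pull pics out of the entry HTML saved in s.data
#         s.pics = pics_from_html_simple(fetch_html("", raw=s.data),
#                                        source_url=s.source_url)
#     return sets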