| field | value | timestamp |
|---|---|---|
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-24 16:42:59 -0800 |
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-24 16:43:33 -0800 |
| commit | cc2aecf897eb80211fae5b57a07d2f98890dee78 (patch) | |
| tree | 7bf3fcf94b1d9d04f89da3bd5da52d37dcb62ab1 | |
| parent | 5d0e1d5c8f33def3d7e48e0cfdbb4286be6fb5fd (diff) | |
| download | fatcat-cc2aecf897eb80211fae5b57a07d2f98890dee78.tar.gz, fatcat-cc2aecf897eb80211fae5b57a07d2f98890dee78.zip | |
refactor make_rel_url
| mode | file | changes |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 60 |
| -rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 18 |
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 17 |
3 files changed, 66 insertions, 29 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 89203a4f..2856b88e 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -47,6 +47,66 @@ def test_clean():
     assert clean('<b>a&b</b>') == '<b>a&b</b>'
     assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>'
 
+DOMAIN_REL_MAP = {
+    "archive.org": "repository",
+    "arxiv.org": "repository",
+    "babel.hathitrust.org": "repository",
+    "cds.cern.ch": "repository",
+    "citeseerx.ist.psu.edu": "repository",
+    "deepblue.lib.umich.edu": "repository",
+    "europepmc.org": "repository",
+    "hal.inria.fr": "repository",
+    "scielo.isciii.es": "repository",
+    "www.dtic.mil": "repository",
+    "www.jstage.jst.go.jp": "repository",
+    "www.jstor.org": "repository",
+    "www.ncbi.nlm.nih.gov": "repository",
+    "www.scielo.br": "repository",
+    "www.scielo.cl": "repository",
+    "www.scielo.org.mx": "repository",
+    "zenodo.org": "repository",
+
+    "academic.oup.com": "publisher",
+    "cdn.elifesciences.org": "publisher",
+    "cell.com": "publisher",
+    "dl.acm.org": "publisher",
+    "downloads.hindawi.com": "publisher",
+    "elifesciences.org": "publisher",
+    "iopscience.iop.org": "publisher",
+    "journals.plos.org": "publisher",
+    "link.springer.com": "publisher",
+    "onlinelibrary.wiley.com": "publisher",
+    "works.bepress.com": "publisher",
+    "www.biomedcentral.com": "publisher",
+    "www.cell.com": "publisher",
+    "www.nature.com": "publisher",
+    "www.pnas.org": "publisher",
+    "www.tandfonline.com": "publisher",
+
+    "www.researchgate.net": "social",
+    "academia.edu": "social",
+
+    "wayback.archive-it.org": "webarchive",
+    "web.archive.org": "webarchive",
+    "archive.is": "webarchive",
+}
+
+def make_rel_url(raw_url, default_link_rel="web"):
+    # this is where we map specific domains to rel types, and also filter out
+    # bad domains, invalid URLs, etc
+    rel = default_link_rel
+    for domain, domain_rel in DOMAIN_REL_MAP.items():
+        if "//{}/".format(domain) in raw_url:
+            rel = domain_rel
+            break
+    return (rel, raw_url)
+
+def test_make_rel_url():
+    assert make_rel_url("http://example.com/thing.pdf")[0] == "web"
+    assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans"
+    assert make_rel_url("https://web.archive.org/web/*/http://example.com/thing.pdf")[0] == "webarchive"
+    assert make_rel_url("http://cell.com/thing.pdf")[0] == "publisher"
+
 class EntityImporter:
     """
     Base class for fatcat entity importers.
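For reference, a minimal sketch of how the new shared helper resolves rel types, mirroring the tests above. The example URLs are illustrative (not from the commit); the import path is assumed from the file location `python/fatcat_tools/importers/common.py`:

```python
from fatcat_tools.importers.common import make_rel_url

# Unmapped domains fall through to the default rel ("web" unless overridden)
assert make_rel_url("http://example.com/paper.pdf") == ("web", "http://example.com/paper.pdf")

# Domains in DOMAIN_REL_MAP short-circuit to their mapped rel; matching is a
# simple "//domain/" substring check, so scheme and path don't matter
assert make_rel_url("https://zenodo.org/record/123/files/paper.pdf")[0] == "repository"
assert make_rel_url("https://www.nature.com/articles/xyz.pdf")[0] == "publisher"

# Importers pass their configured default through the keyword argument
assert make_rel_url("http://example.com/paper.pdf", default_link_rel="dweb")[0] == "dweb"
```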
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 9d95fe0b..25f9fa89 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,7 +5,7 @@ import json
 import base64
 import datetime
 import fatcat_client
-from .common import EntityImporter, clean
+from .common import EntityImporter, clean, make_rel_url
 
 
 MAX_ABSTRACT_BYTES=4096
@@ -157,18 +157,6 @@ class GrobidMetadataImporter(EntityImporter):
             extra=extra)
         return re
 
-    # TODO: make this a common function somewhere
-    def make_url(self, raw):
-        rel = self.default_link_rel
-        # TODO: this is where we could map specific domains to rel types,
-        # and also filter out bad domains, invalid URLs, etc
-        if "//archive.org/" in raw or "//arxiv.org/" in raw:
-            # TODO: special-case the arxiv.org bulk mirror?
-            rel = "repository"
-        elif "//web.archive.org/" in raw or "//archive.is/" in raw:
-            rel = "webarchive"
-        return fatcat_client.FileEntityUrls(url=raw, rel=rel)
-
     def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size):
 
         sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower()
@@ -189,9 +177,9 @@ class GrobidMetadataImporter(EntityImporter):
             original)
         fe.urls.append(
             fatcat_client.FileEntityUrls(url=wayback, rel="webarchive"))
-        original_url = self.make_url(original)
+        original_url = make_rel_url(original, default_link_rel=self.default_link_rel)
         if original_url is not None:
-            fe.urls.append(original_url)
+            fe.urls.append(fatcat_client.FileEntityUrls(rel=original_url[0], url=original_url[1]))
 
         return fe
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 2ec6c95d..aca2cc34 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
 import sqlite3
 import itertools
 import fatcat_client
-from .common import EntityImporter, clean
+from .common import EntityImporter, clean, make_rel_url
 
 
 class MatchedImporter(EntityImporter):
@@ -43,17 +43,6 @@ class MatchedImporter(EntityImporter):
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         self.default_mime = kwargs.get("default_mime", None)
 
-    def make_url(self, raw):
-        rel = self.default_link_rel
-        # TODO: this is where we could map specific domains to rel types,
-        # and also filter out bad domains, invalid URLs, etc
-        if "//archive.org/" in raw or "//arxiv.org/" in raw:
-            # TODO: special-case the arxiv.org bulk mirror?
-            rel = "repository"
-        elif "//web.archive.org/" in raw or "//archive.is/" in raw:
-            rel = "webarchive"
-        return (rel, raw)
-
     def want(self, raw_record):
         return True
 
@@ -80,7 +69,7 @@ class MatchedImporter(EntityImporter):
         # parse URLs and CDX
         urls = set()
         for url in obj.get('url', []):
-            url = self.make_url(url)
+            url = make_rel_url(url, default_link_rel=self.default_link_rel)
             if url != None:
                 urls.add(url)
         for cdx in obj.get('cdx', []):
@@ -89,7 +78,7 @@
                 cdx['dt'],
                 original)
             urls.add(("webarchive", wayback))
-            url = self.make_url(original)
+            url = make_rel_url(original, default_link_rel=self.default_link_rel)
             if url != None:
                 urls.add(url)
         urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls]
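After this change both importers share the same calling pattern: accumulate plain `(rel, url)` tuples in a set, so exact duplicates collapse (tuples hash; API objects would not), and only construct `fatcat_client.FileEntityUrls` at the end. A minimal sketch of that pattern, with illustrative URLs and assuming `fatcat_client` is importable as in the importers above:

```python
import fatcat_client
from fatcat_tools.importers.common import make_rel_url

# Illustrative input; a real record's 'url' list can repeat the same URL
raw_urls = [
    "https://web.archive.org/web/20190101000000/http://example.com/a.pdf",
    "http://example.com/a.pdf",
    "http://example.com/a.pdf",  # exact duplicate; the set collapses it
]

urls = set()
for raw in raw_urls:
    url = make_rel_url(raw, default_link_rel="web")
    if url != None:
        urls.add(url)

assert len(urls) == 2  # ("webarchive", wayback) and ("web", original)

# Only at the end are API objects built from the plain tuples, positionally
# (rel, url), matching the list comprehension in the diff above
urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls]
```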
