summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-01-24 16:42:59 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-01-24 16:43:33 -0800
commit cc2aecf897eb80211fae5b57a07d2f98890dee78 (patch)
tree 7bf3fcf94b1d9d04f89da3bd5da52d37dcb62ab1 /python/fatcat_tools/importers/common.py
parent 5d0e1d5c8f33def3d7e48e0cfdbb4286be6fb5fd (diff)
download fatcat-cc2aecf897eb80211fae5b57a07d2f98890dee78.tar.gz
fatcat-cc2aecf897eb80211fae5b57a07d2f98890dee78.zip
refactor make_rel_url
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- python/fatcat_tools/importers/common.py 60
1 files changed, 60 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 89203a4f..2856b88e 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -47,6 +47,66 @@ def test_clean():
assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+# Maps a hostname to the link "rel" type that make_rel_url() reports for URLs
+# on that host. Keys are compared literally (no subdomain or suffix matching),
+# so a bare domain and its "www." variant need separate entries; see
+# "cell.com" and "www.cell.com" below.
+DOMAIN_REL_MAP = {
+ # hosts reported as "repository"
+ "archive.org": "repository",
+ "arxiv.org": "repository",
+ "babel.hathitrust.org": "repository",
+ "cds.cern.ch": "repository",
+ "citeseerx.ist.psu.edu": "repository",
+ "deepblue.lib.umich.edu": "repository",
+ "europepmc.org": "repository",
+ "hal.inria.fr": "repository",
+ "scielo.isciii.es": "repository",
+ "www.dtic.mil": "repository",
+ "www.jstage.jst.go.jp": "repository",
+ "www.jstor.org": "repository",
+ "www.ncbi.nlm.nih.gov": "repository",
+ "www.scielo.br": "repository",
+ "www.scielo.cl": "repository",
+ "www.scielo.org.mx": "repository",
+ "zenodo.org": "repository",
+
+ # hosts reported as "publisher"
+ "academic.oup.com": "publisher",
+ "cdn.elifesciences.org": "publisher",
+ "cell.com": "publisher",
+ "dl.acm.org": "publisher",
+ "downloads.hindawi.com": "publisher",
+ "elifesciences.org": "publisher",
+ "iopscience.iop.org": "publisher",
+ "journals.plos.org": "publisher",
+ "link.springer.com": "publisher",
+ "onlinelibrary.wiley.com": "publisher",
+ "works.bepress.com": "publisher",
+ "www.biomedcentral.com": "publisher",
+ "www.cell.com": "publisher",
+ "www.nature.com": "publisher",
+ "www.pnas.org": "publisher",
+ "www.tandfonline.com": "publisher",
+
+ # hosts reported as "social"
+ "www.researchgate.net": "social",
+ "academia.edu": "social",
+
+ # hosts reported as "webarchive"
+ "wayback.archive-it.org": "webarchive",
+ "web.archive.org": "webarchive",
+ "archive.is": "webarchive",
+}
+
+def make_rel_url(raw_url, default_link_rel="web"):
+    """
+    Pick a link "rel" type for a URL based on the host it points at.
+
+    The URL's own hostname is looked up in DOMAIN_REL_MAP; when the host is
+    not listed (or the URL has no parseable host), default_link_rel is used.
+
+    Only the real host is considered. A domain that merely appears inside
+    the path or query (eg, the original URL embedded in a wayback capture
+    like "https://web.archive.org/web/2019/https://arxiv.org/abs/1") does
+    not influence the result.
+
+    Returns a (rel, raw_url) tuple; raw_url is passed through unmodified.
+    """
+    # function-scope import so this helper does not depend on the module's
+    # import block
+    from urllib.parse import urlsplit
+
+    try:
+        # .hostname is lower-cased and has any userinfo/port stripped, so
+        # "HTTP://Cell.com:80" and "http://cell.com" both yield "cell.com"
+        host = urlsplit(raw_url).hostname
+    except ValueError:
+        # malformed netloc (eg, unbalanced IPv6 brackets): nothing to match
+        host = None
+    return (DOMAIN_REL_MAP.get(host, default_link_rel), raw_url)
+
+def test_make_rel_url():
+    """
+    Spot-check make_rel_url(): the built-in fallback rel, a caller-supplied
+    fallback, and two hosts that are listed in the domain map.
+    """
+    # (positional args, keyword args, expected rel)
+    cases = [
+        (("http://example.com/thing.pdf",), {}, "web"),
+        (("http://example.com/thing.pdf",), {"default_link_rel": "jeans"}, "jeans"),
+        (("https://web.archive.org/web/*/http://example.com/thing.pdf",), {}, "webarchive"),
+        (("http://cell.com/thing.pdf",), {}, "publisher"),
+    ]
+    for args, kwargs, expected_rel in cases:
+        assert make_rel_url(*args, **kwargs)[0] == expected_rel
+
class EntityImporter:
"""
Base class for fatcat entity importers.