aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-11-15 13:11:52 -0800
committerBryan Newbold <bnewbold@robocracy.org>2018-11-15 13:15:15 -0800
commitbb28a3fc1cc900f2dde31e1dbc492d9661034f41 (patch)
treef037dd3d1bab6cbf08a562dbdd4c09361fe0c030 /python/fatcat_tools/importers
parent9f817c6c70a749f2ac449ab4edfd26c6dd8a7410 (diff)
downloadfatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.tar.gz
fatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.zip
large refactor of python names/paths
- Add __init__.py files for fatcat_tools submodules, and use them in imports - Add a bunch of comments to files - Rename a number of classes and functions to be less verbose
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/__init__.py7
-rw-r--r--python/fatcat_tools/importers/common.py3
-rw-r--r--python/fatcat_tools/importers/crossref.py12
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py4
-rw-r--r--python/fatcat_tools/importers/issn.py14
-rw-r--r--python/fatcat_tools/importers/matched.py10
-rw-r--r--python/fatcat_tools/importers/orcid.py4
7 files changed, 42 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
new file mode 100644
index 00000000..0f5fafb6
--- /dev/null
+++ b/python/fatcat_tools/importers/__init__.py
@@ -0,0 +1,7 @@
+
+from .common import FatcatImporter
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .grobid_metadata import GrobidMetadataImporter
+from .issn import IssnImporter
+from .matched import MatchedImporter
+from .orcid import OrcidImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index d289171d..9cf92b41 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -16,6 +16,9 @@ def grouper(iterable, n, fillvalue=None):
return itertools.zip_longest(*args, fillvalue=fillvalue)
class FatcatImporter:
+ """
+ Base class for fatcat importers
+ """
def __init__(self, host_url, issn_map_file=None):
conf = fatcat_client.Configuration()
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fe80c2d3..fac8f32b 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -5,9 +5,11 @@ import sqlite3
import datetime
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
+# The docs/guide should be the canonical home for these mappings; update there
+# first
CROSSREF_TYPE_MAP = {
'book': 'book',
'book-chapter': 'chapter',
@@ -29,8 +31,14 @@ CROSSREF_TYPE_MAP = {
'standard': 'standard',
}
+class CrossrefImporter(FatcatImporter):
+ """
+ Importer for Crossref metadata.
-class FatcatCrossrefImporter(FatcatImporter):
+ Can use a local sqlite3 file for faster "external identifier" lookups
+
+ See https://github.com/CrossRef/rest-api-doc for JSON schema notes
+ """
def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
super().__init__(host_url, issn_map_file)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index dedc9728..ba8a4e6f 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,12 @@ import json
import base64
import datetime
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
MAX_ABSTRACT_BYTES=4096
-class FatcatGrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(FatcatImporter):
def __init__(self, host_url, default_link_rel="web"):
super().__init__(host_url)
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index ba8492c6..0b0efccb 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -3,10 +3,8 @@ import sys
import json
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
-# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
def or_none(s):
if s is None:
@@ -26,7 +24,15 @@ def truthy(s):
else:
return None
-class FatcatIssnImporter(FatcatImporter):
+class IssnImporter(FatcatImporter):
+ """
+ Imports journal metadata ("containers") by ISSN, currently from a custom
+ (data munged) .csv file format
+
+ CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+
+ ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+ """
def parse_issn_row(self, row):
"""
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 774019c7..732fccbe 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
#row = row.split('\t')
#assert len(row) == 2
@@ -13,8 +13,14 @@ from fatcat_tools.importers.common import FatcatImporter
#print(sha1)
#dois = [d.lower() for d in json.loads(row[1])]
-class FatcatMatchedImporter(FatcatImporter):
+class MatchedImporter(FatcatImporter):
"""
+ Importer for "file to crossref DOI" matches.
+
+ These matches are currently generated by Internet Archive hadoop jobs
+ written in scala (part of the 'sandcrawler' repo/project), but could be
+ generated by other parties as well.
+
Input format is JSON with keys:
- dois (list)
- sha1 (hex)
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 527316dd..9e4767f9 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
def value_or_none(e):
if type(e) == dict:
@@ -20,7 +20,7 @@ def value_or_none(e):
return None
return e
-class FatcatOrcidImporter(FatcatImporter):
+class OrcidImporter(FatcatImporter):
def parse_orcid_dict(self, obj):
"""