author     Bryan Newbold <bnewbold@robocracy.org>    2019-01-23 21:08:36 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>    2019-01-23 21:08:36 -0800
commit     ad9cd8a98aa5dce24d49b5a9c460a653f5e840a3 (patch)
tree       54ea1dde0f831265550dbe66a7ccc867a83b80ce
parent     4fd244c2b1dd393d5b26607f99777a678a78c781 (diff)
importer bugfixes
-rw-r--r--  python/fatcat_tools/importers/crossref.py         6
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py   8
-rw-r--r--  python/fatcat_tools/importers/matched.py           8
3 files changed, 14 insertions, 8 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 75132901..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -111,7 +111,7 @@ class CrossrefImporter(EntityImporter):
         return CROSSREF_TYPE_MAP.get(crossref_type)
 
     def map_container_type(self, crossref_type):
-        return CONTAINER_TYPE_MAP.get(release_type)
+        return CONTAINER_TYPE_MAP.get(crossref_type)
 
     def want(self, obj):
         if not obj.get('title'):
@@ -238,7 +238,7 @@ class CrossrefImporter(EntityImporter):
             if rm.get('DOI'):
                 extra['doi'] = rm.get('DOI').lower()
             # TODO: what fields here? CSL citation stuff
-            for k in ('authors', 'editor', 'edition', 'authority', 'version',
+            for k in ('author', 'editor', 'edition', 'authority', 'version',
                     'genre', 'url', 'event', 'issue', 'volume', 'date',
                     'accessed_date', 'issued', 'page', 'medium',
                     'collection_title', 'chapter_number'):
@@ -253,7 +253,7 @@ class CrossrefImporter(EntityImporter):
                 # doing lookups would be a second import pass
                 target_release_id=None,
                 key=key,
-                year=clean(year),
+                year=year,
                 container_name=clean(container_name),
                 title=clean(rm.get('title')),
                 locator=clean(rm.get('first-page')),
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 468b0ede..9d95fe0b 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -32,6 +32,7 @@ class GrobidMetadataImporter(EntityImporter):
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra)
         self.default_link_rel = kwargs.get("default_link_rel", "web")
+        self.longtail_oa = kwargs.get("longtail_oa", False)
 
     def want(self, raw_record):
         return True
@@ -130,12 +131,13 @@ class GrobidMetadataImporter(EntityImporter):
         if obj.get('doi'):
             extra['doi'] = obj['doi']
         if obj['journal'] and obj['journal'].get('name'):
-            extra['container_name'] = obj['journal']['name']
-
-        extra['is_longtail_oa'] = True
+            extra['container_name'] = clean(obj['journal']['name'])
 
         # TODO: ISSN/eISSN handling? or just journal name lookup?
 
+        if self.longtail_oa:
+            extra['longtail_oa'] = True
+
         if extra:
             extra = dict(grobid=extra)
         else:
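
Note: with this change the longtail_oa extra becomes opt-in via a constructor keyword instead of being set unconditionally. A minimal, hypothetical usage sketch follows; only the "longtail_oa" kwarg comes from the diff above, while the api client argument and its setup are assumptions:

    # hedged sketch, not part of this commit
    from fatcat_tools.importers.grobid_metadata import GrobidMetadataImporter

    # `api` is assumed to be an already-configured fatcat API client
    importer = GrobidMetadataImporter(api, longtail_oa=True)
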
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 123e3530..2ec6c95d 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -120,6 +120,9 @@ class MatchedImporter(EntityImporter):
             if err.status != 404:
                 raise err
 
+        if not existing:
+            return True
+
         fe.release_ids = list(set(fe.release_ids + existing.release_ids))
         if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0:
             # no new release matches *and* there are already existing URLs
@@ -127,13 +130,14 @@ class MatchedImporter(EntityImporter):
             return False
 
         # merge the existing into this one and update
-        existing.urls = list(set(fe.urls + existing.urls))
+        existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls]))
+        existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls]
         existing.release_ids = list(set(fe.release_ids + existing.release_ids))
         existing.mimetype = existing.mimetype or fe.mimetype
         existing.size = existing.size or fe.size
         existing.md5 = existing.md5 or fe.md5
         existing.sha256 = existing.sha256 or fe.sha256
-        self.api.update_file(existing.ident, existing)
+        self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup())
         self.counts['update'] += 1
         return False
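
Note: the URL merge above de-duplicates on (rel, url) pairs and then rebuilds FileEntityUrls objects, rather than putting the model objects into a set directly. A standalone sketch of the same pattern; the function name and arguments are illustrative, only fatcat_client.FileEntityUrls is taken from the diff:

    import fatcat_client

    def merge_file_urls(new_urls, existing_urls):
        # collapse to hashable (rel, url) tuples so duplicates compare equal,
        # then rebuild FileEntityUrls objects from the de-duplicated pairs
        pairs = set((u.rel, u.url) for u in new_urls + existing_urls)
        return [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in pairs]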