aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CONTRIBUTORS.md17
-rwxr-xr-xpython/fatcat_harvest.py2
-rwxr-xr-xpython/fatcat_import.py4
-rwxr-xr-xpython/fatcat_ingest.py2
-rw-r--r--python/fatcat_tools/harvest/harvest_common.py4
-rw-r--r--python/fatcat_tools/importers/common.py2
-rw-r--r--python/fatcat_tools/importers/crossref.py4
-rw-r--r--python/fatcat_tools/importers/datacite.py8
-rw-r--r--python/fatcat_tools/transforms/csl.py18
-rw-r--r--python/fatcat_tools/workers/changelog.py26
-rwxr-xr-xpython/fatcat_transform.py2
-rw-r--r--python/fatcat_web/__init__.py2
-rw-r--r--python/fatcat_web/entity_helpers.py23
-rw-r--r--python/fatcat_web/search.py2
-rw-r--r--python/fatcat_web/templates/release_view.html3
-rw-r--r--python/tests/transform_csl.py20
-rw-r--r--python/tests/web_citation_csl.py46
-rw-r--r--python/tests/web_entity_views.py4
-rw-r--r--python/tests/web_search.py4
19 files changed, 147 insertions, 46 deletions
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 414d1a96..8fe1a830 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,3 +1,16 @@
+Fatcat Contributors (alphabetically sorted)
+==============================================
-Special Thanks to Asheesh Laroia, who reviewed and gave excellent feedback on
-the fatcat schemas, structure, and python library in November, 2018.
+* [Edward Betts](https://github.com/EdwardBetts)
+
+* [Martin Czygan](https://github.com/miku)
+
+* [Asheesh Laroia](http://www.asheesh.org/)
+
+ * reviewed and gave excellent feedback on the fatcat schemas, structure, and python library in November, 2018.
+
+* [Bryan Newbold](https://bnewbold.net)
+
+* [Ellen Spertus](http://www.spertus.com/ellen/)
+
+ * IA Volunteer Summer 2018, wrote a Hadoop/Scala/Scalding matching job which resulted in millions of fatcat fulltext matches
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 151b025d..a45b44f8 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -83,7 +83,7 @@ def main():
help="Kafka topic namespace to use (eg, prod, qa, dev)")
parser.add_argument('--start-date',
default=None, type=mkdate,
- help="begining of harvest period")
+ help="beginning of harvest period")
parser.add_argument('--end-date',
default=None, type=mkdate,
help="end of harvest period")
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index e1e06653..331cf791 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -416,7 +416,7 @@ def main():
help="whether postproc_status column must be '200'")
sub_arabesque_match.add_argument('--extid-type',
default="doi",
- help="identifer type in the database (eg, 'doi', 'pmcid'")
+ help="identifier type in the database (eg, 'doi', 'pmcid')")
sub_arabesque_match.add_argument('--crawl-id',
help="crawl ID (optionally included in editgroup metadata)")
sub_arabesque_match.add_argument('--default-link-rel',
@@ -424,7 +424,7 @@ def main():
help="default URL rel for matches (eg, 'publisher', 'web')")
sub_ingest_file = subparsers.add_parser('ingest-file-results',
- help="add/update flie entities linked to releases based on sandcrawler ingest results")
+ help="add/update file entities linked to releases based on sandcrawler ingest results")
sub_ingest_file.set_defaults(
func=run_ingest_file,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 9ba95015..c6f27ad3 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -185,7 +185,7 @@ def main():
help="list of Kafka brokers (host/port) to use")
parser.add_argument('--elasticsearch-endpoint',
default="https://search.fatcat.wiki",
- help="elasticsearch API. internal endpoint prefered, but public is default")
+ help="elasticsearch API. internal endpoint preferred, but public is default")
parser.add_argument('--env',
default="dev",
help="Kafka topic namespace to use (eg, prod, qa, dev)")
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index 310366bd..5f7aa084 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -133,7 +133,7 @@ class HarvestState:
def fail_fast(err, msg):
if err:
raise KafkaException(err)
- print("Commiting status to Kafka: {}".format(kafka_topic), file=sys.stderr)
+ print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr)
producer_conf = kafka_config.copy()
producer_conf.update({
'delivery.report.only.error': True,
@@ -164,7 +164,7 @@ class HarvestState:
raise KafkaException(err)
conf = kafka_config.copy()
conf.update({
- 'group.id': 'dummy_init_group', # should never be commited
+ 'group.id': 'dummy_init_group', # should never be committed
'enable.auto.commit': False,
'auto.offset.reset': 'earliest',
'session.timeout.ms': 10000,
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index c000ad62..da611ecb 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -816,7 +816,7 @@ class KafkaJsonPusher(RecordPusher):
while True:
# Note: this is batch-oriented, because underlying importer is
# often batch-oriented, but this doesn't confirm that entire batch
- # has been pushed to fatcat before commiting offset. Eg, consider
+ # has been pushed to fatcat before committing offset. Eg, consider
# case where there there is one update and thousands of creates;
# update would be lingering in importer, and if importer crashed
# never created.
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index bd070ef1..9617299c 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -9,7 +9,7 @@ import fatcat_openapi_client
from .common import EntityImporter, clean
-# The docs/guide should be the cannonical home for these mappings; update there
+# The docs/guide should be the canonical home for these mappings; update there
# first
# Can get a list of Crossref types (with counts) via API:
# https://api.crossref.org/works?rows=0&facet=type-name:*
@@ -188,7 +188,7 @@ class CrossrefImporter(EntityImporter):
self.counts['skip-release-type'] += 1
return None
- # Do require the 'title' keys to exsit, as release entities do
+ # Do require the 'title' keys to exist, as release entities do
if (not 'title' in obj) or (not obj['title']):
self.counts['skip-blank-title'] += 1
return None
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 5b736787..81f00876 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -3,7 +3,7 @@ Prototype importer for datacite.org data.
Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51
-Datacite being an aggregator, the data is heterogenous and exposes a couple of
+Datacite being an aggregator, the data is heterogeneous and exposes a couple of
problems in content and structure. A few fields have their own parsing
functions (parse_datacite_...), which may help testing.
"""
@@ -36,7 +36,7 @@ CONTAINER_TYPE_MAP = {
'Book Series': 'book-series',
}
-# The docs/guide should be the cannonical home for these mappings; update there
+# The docs/guide should be the canonical home for these mappings; update there
# first. Map various datacite type types to CSL-ish types. None means TODO or
# remove.
DATACITE_TYPE_MAP = {
@@ -228,7 +228,7 @@ class DataciteImporter(EntityImporter):
def lookup_ext_ids(self, doi):
"""
- Return dictionary of identifiers refering to the same things as the given DOI.
+ Return dictionary of identifiers referring to the same things as the given DOI.
"""
if self.extid_map_db is None:
return dict(core_id=None,
@@ -584,7 +584,7 @@ class DataciteImporter(EntityImporter):
# Include certain relations from relatedIdentifiers. Keeping the
# original structure of data here, which is a list of dicts, with
- # relation type, identifer and identifier type (mostly).
+ # relation type, identifier and identifier type (mostly).
relations = []
for rel in relIds:
if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py
index 7ab94cac..832ad6aa 100644
--- a/python/fatcat_tools/transforms/csl.py
+++ b/python/fatcat_tools/transforms/csl.py
@@ -37,8 +37,9 @@ def release_to_csl(entity):
# Default to "local" (publication-specific) metadata; fall back to
# creator-level
family = contrib.surname or contrib.creator.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
- if not contrib.raw_name:
- raise ValueError("CSL requires some surname (family name)")
+ if not family:
+ # CSL requires some surname (family name)
+ continue
c = dict(
family=family,
given=contrib.given_name or contrib.creator.given_name,
@@ -49,22 +50,27 @@ def release_to_csl(entity):
#static-ordering
literal=contrib.raw_name or contrib.creator.display_name,
#parse-names,
- role=contrib.role,
+ # role must be defined; default to author
+ role=contrib.role or 'author',
)
else:
family = contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
- if not contrib.raw_name:
- raise ValueError("CSL requires some surname (family name)")
+ if not family:
+ # CSL requires some surname (family name)
+ continue
c = dict(
family=family,
given=contrib.given_name,
literal=contrib.raw_name,
- role=contrib.role,
+ # role must be defined; default to author
+ role=contrib.role or 'author',
)
for k in list(c.keys()):
if not c[k]:
c.pop(k)
contribs.append(c)
+ if not contribs:
+ raise ValueError("citeproc requires at least one author with a surname")
abstract = None
if entity.abstracts:
abstract = entity.abstracts[0].content
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 5783bbfc..d1e7c2db 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -110,6 +110,32 @@ class EntityUpdatesWorker(FatcatWorker):
# the lancet (often hybrid OA)
"10.1016/s0140-6736",
"10.1016/s2213-2600",
+ # journal of virology
+ "10.1128/jvi.",
+ # FEBS letters
+ "10.1002/1873-3468.",
+ # Journal of Neuroscience
+ "10.1523/jneurosci.",
+ # Chemical and pharmaceutical bulletin
+ "10.1248/cpb.",
+ # Japanese Journal of Radiological Technology
+ "10.6009/jjrt.",
+ # Seibutsu Butsuri
+ "10.2142/biophys.",
+ # Chemical Communications
+ "10.1039/d0cc",
+ # Yakugaku zasshi
+ "10.1248/yakushi.",
+ # bulletin AMS
+ "10.1090/s0002-9904",
+ # Current Biology
+ "10.1016/j.cub.",
+ # Antarctica A Keystone in a Changing World
+ "10.3133/ofr",
+ # Clinical Cancer Research
+ "10.1158/1078-0432.",
+ # Transactions of the Japan Society of Mechanical Engineers
+ "10.1299/kikai",
]
def want_live_ingest(self, release, ingest_request):
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
index 23a56109..14595670 100755
--- a/python/fatcat_transform.py
+++ b/python/fatcat_transform.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
-Utility script for doing bulk conversion/tranforms of entity JSON schema to
+Utility script for doing bulk conversion/transforms of entity JSON schema to
other formats
"""
diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py
index 50757858..56a2e020 100644
--- a/python/fatcat_web/__init__.py
+++ b/python/fatcat_web/__init__.py
@@ -61,7 +61,7 @@ else:
print("No privileged token found")
priv_api = None
-# TODO: refactor integration so this doesn't always need to be definied. If
+# TODO: refactor integration so this doesn't always need to be defined. If
# key/secret are empty, library will not init; if init is skipped, get
# undefined errors elsewhere.
mwoauth = MWOAuth(
diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py
index 7a9830f9..7ac0f155 100644
--- a/python/fatcat_web/entity_helpers.py
+++ b/python/fatcat_web/entity_helpers.py
@@ -1,6 +1,6 @@
from flask import abort
-from fatcat_openapi_client.rest import ApiException
+from fatcat_openapi_client.rest import ApiException, ApiValueError
from fatcat_tools.transforms import *
from fatcat_web import app, api
from fatcat_web.search import get_elastic_container_stats, get_elastic_container_random_releases
@@ -78,8 +78,13 @@ def enrich_release_entity(entity):
entity.subtitle = entity.extra['subtitle']
# author list to display; ensure it's sorted by index (any othors with
# index=None go to end of list)
- authors = [c for c in entity.contribs if c.role in ('author', None)]
+ authors = [c for c in entity.contribs if
+ c.role in ('author', None) and
+ (c.surname or c.raw_name or (c.creator and c.creator.surname))
+ ]
entity._authors = sorted(authors, key=lambda c: (c.index == None and 99999999) or c.index)
+ # need authors, title for citeproc to work
+ entity._can_citeproc = bool(entity._authors) and bool(entity.title)
if entity.abstracts:
# hack to show plain text instead of latex abstracts
if 'latex' in entity.abstracts[0].mimetype:
@@ -122,6 +127,8 @@ def generic_get_entity(entity_type, ident):
raise NotImplementedError
except ApiException as ae:
abort(ae.status)
+ except ApiValueError:
+ abort(400)
def generic_get_entity_revision(entity_type, revision_id):
try:
@@ -143,6 +150,8 @@ def generic_get_entity_revision(entity_type, revision_id):
raise NotImplementedError
except ApiException as ae:
abort(ae.status)
+ except ApiValueError:
+ abort(400)
def generic_get_editgroup_entity(editgroup, entity_type, ident):
if entity_type == 'container':
@@ -168,9 +177,15 @@ def generic_get_editgroup_entity(editgroup, entity_type, ident):
edit = e
break
if not revision_id:
- # couldn't find relevent edit in this editgroup
+ # couldn't find relevant edit in this editgroup
abort(404)
- entity = generic_get_entity_revision(entity_type, revision_id)
+ try:
+ entity = generic_get_entity_revision(entity_type, revision_id)
+ except ApiException as ae:
+ abort(ae.status)
+ except ApiValueError:
+ abort(400)
+
entity.ident = ident
return entity, edit
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 6b2b9cc1..c1246d22 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -299,7 +299,7 @@ def get_elastic_container_histogram(ident):
"""
Fetches a stacked histogram of
- Filters to the past 500 years (at most), or about 1000 vaules.
+ Filters to the past 500 years (at most), or about 1000 values.
Returns a list of tuples:
(year, in_ia, count)
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 961b4759..d7c4e76e 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -388,8 +388,7 @@ accessible version.
<br>grouping other versions (eg, pre-print) and variants of this release
</div>
-{# this restriction, for CSL-JSON generation, rules out almost everything #}
-{% if release.contribs and release.contribs[0].creator_id %}
+{% if release._can_citeproc %}
<div class="ui segment attached accordion">
<div class="title" style="padding: 0px;">
<i class="dropdown icon"></i><b>Cite This Release</b>
diff --git a/python/tests/transform_csl.py b/python/tests/transform_csl.py
index 6f29cba7..15c64ce5 100644
--- a/python/tests/transform_csl.py
+++ b/python/tests/transform_csl.py
@@ -12,22 +12,22 @@ def test_csl_crossref(crossref_importer):
# not a single line
raw = json.loads(f.read())
r = crossref_importer.parse_record(raw)
- # this work has some null contrib names; these should cause errors
- with pytest.raises(ValueError):
- release_to_csl(r)
- with pytest.raises(ValueError):
- csl = release_to_csl(r)
- citeproc_csl(csl, 'csl-json')
- # set with dummy so we can run other tests
- for c in r.contribs:
- if not c.raw_name:
- c.raw_name = "dummy"
csl = release_to_csl(r)
citeproc_csl(csl, 'csl-json')
citeproc_csl(csl, 'bibtex')
citeproc_csl(csl, 'harvard1')
citeproc_csl(csl, 'harvard1', html=True)
+ # check that with no author surnames, can't run
+ for c in r.contribs:
+ c.raw_name = None
+ c.surname = None
+ with pytest.raises(ValueError):
+ release_to_csl(r)
+ with pytest.raises(ValueError):
+ csl = release_to_csl(r)
+ citeproc_csl(csl, 'csl-json')
+
def test_csl_pubmed(crossref_importer):
with open('tests/files/example_releases_pubmed19n0972.json', 'r') as f:
# multiple single lines
diff --git a/python/tests/web_citation_csl.py b/python/tests/web_citation_csl.py
index 3279ebea..e016b2d9 100644
--- a/python/tests/web_citation_csl.py
+++ b/python/tests/web_citation_csl.py
@@ -6,7 +6,7 @@ from fatcat_openapi_client.rest import ApiException
from fixtures import *
-def test_release_bibtex(app):
+def test_release_bibtex(app, api):
# "realistic" demo entity
rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam')
@@ -17,6 +17,8 @@ def test_release_bibtex(app):
assert b'@article{' in rv.data
rv = app.get('/release/ccccccccccccccccccccccccca.bib')
assert rv.status_code == 404
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=bibtex')
+ assert rv.status_code == 200
rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=csl-json')
assert rv.status_code == 200
# could also rv.get_json() here
@@ -25,10 +27,48 @@ def test_release_bibtex(app):
assert rv.status_code == 200
assert rv.data.decode('utf-8').startswith('Ioannidis, John. “Why Most Published Research Findings Are False”. 2.8 (2005)')
- # "dummy" demo entity
+ # "dummy" demo entity; very minimal metadata
rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai')
assert rv.status_code == 200
+ assert b'BibTeX' in rv.data
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+ assert rv.status_code == 200
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=modern-language-association')
+ assert rv.status_code == 200
+ rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=csl-json')
+ assert rv.status_code == 200
+
+ # create release which can not have citeproc run on it (no authors)
+ eg = quick_eg(api)
+ r1 = ReleaseEntity(
+ title="some title",
+ ext_ids=ReleaseExtIds(),
+ )
+ r1edit = api.create_release(eg.editgroup_id, r1)
+ api.accept_editgroup(eg.editgroup_id)
+
+ rv = app.get('/release/{}'.format(r1edit.ident))
+ assert rv.status_code == 200
assert not b'BibTeX' in rv.data
with pytest.raises(ValueError):
- rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+ rv = app.get('/release/{}.bib'.format(r1edit.ident))
+
+ # create release which can have citeproc run on it (has an author with a surname)
+ eg = quick_eg(api)
+ r2 = ReleaseEntity(
+ title="some title again",
+ contribs=[
+ ReleaseContrib(
+ given_name="Paul",
+ surname="Otlet"),
+ ],
+ ext_ids=ReleaseExtIds(),
+ )
+ r2edit = api.create_release(eg.editgroup_id, r2)
+ api.accept_editgroup(eg.editgroup_id)
+ rv = app.get('/release/{}'.format(r2edit.ident))
+ assert rv.status_code == 200
+ assert b'BibTeX' in rv.data
+ rv = app.get('/release/{}.bib'.format(r2edit.ident))
+ assert rv.status_code == 200
diff --git a/python/tests/web_entity_views.py b/python/tests/web_entity_views.py
index 23a2b33b..a3f0f897 100644
--- a/python/tests/web_entity_views.py
+++ b/python/tests/web_entity_views.py
@@ -42,6 +42,8 @@ def test_entity_basics(app):
assert rv.status_code == 200
rv = app.get('/{}/rev/{}'.format(entity_type, revision))
assert rv.status_code == 200
+ rv = app.get('/{}/rev/{}_something'.format(entity_type, revision))
+ assert rv.status_code == 400
rv = app.get('/{}/rev/{}/metadata'.format(entity_type, revision))
assert rv.status_code == 200
print('/editgroup/aaaaaaaaaaaabo53aaaaaaaaaq/{}/{}'.format(entity_type, ident))
@@ -63,7 +65,7 @@ def test_entity_basics(app):
# TODO: redirects and deleted entities
def test_web_deleted_release(app, api):
- # specific regresion test for view of a deleted release
+ # specific regression test for view of a deleted release
# create release
eg = quick_eg(api)
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 19e2c29f..24b817dc 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -75,7 +75,7 @@ def test_stats(app):
json=elastic_resp3.copy(), status=200)
rv = app.get('/stats')
assert rv.status_code == 200
- # TODO: probe these reponses better
+ # TODO: probe these responses better
@responses.activate
def test_stats_json(app):
@@ -112,7 +112,7 @@ def test_container_stats(app):
json=elastic_resp, status=200)
rv = app.get('/container/issnl/1234-5678/stats.json')
assert rv.status_code == 200
- # TODO: probe this reponse better
+ # TODO: probe this response better
# TODO: container stats
# TODO: container ISSN-L query