-rw-r--r--  CONTRIBUTORS.md | 17
-rwxr-xr-x  python/fatcat_harvest.py | 2
-rwxr-xr-x  python/fatcat_import.py | 4
-rwxr-xr-x  python/fatcat_ingest.py | 2
-rw-r--r--  python/fatcat_tools/harvest/harvest_common.py | 4
-rw-r--r--  python/fatcat_tools/importers/common.py | 2
-rw-r--r--  python/fatcat_tools/importers/crossref.py | 4
-rw-r--r--  python/fatcat_tools/importers/datacite.py | 8
-rw-r--r--  python/fatcat_tools/transforms/csl.py | 18
-rw-r--r--  python/fatcat_tools/workers/changelog.py | 26
-rwxr-xr-x  python/fatcat_transform.py | 2
-rw-r--r--  python/fatcat_web/__init__.py | 2
-rw-r--r--  python/fatcat_web/entity_helpers.py | 23
-rw-r--r--  python/fatcat_web/search.py | 2
-rw-r--r--  python/fatcat_web/templates/release_view.html | 3
-rw-r--r--  python/tests/transform_csl.py | 20
-rw-r--r--  python/tests/web_citation_csl.py | 46
-rw-r--r--  python/tests/web_entity_views.py | 4
-rw-r--r--  python/tests/web_search.py | 4
19 files changed, 147 insertions, 46 deletions
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 414d1a96..8fe1a830 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,3 +1,16 @@
+Fatcat Contributors (alphabetically sorted)
+==============================================
-Special Thanks to Asheesh Laroia, who reviewed and gave excellent feedback on
-the fatcat schemas, structure, and python library in November, 2018.
+* [Edward Betts](https://github.com/EdwardBetts)
+
+* [Martin Czygan](https://github.com/miku)
+
+* [Asheesh Laroia](http://www.asheesh.org/)
+
+  * reviewed and gave excellent feedback on the fatcat schemas, structure, and python library in November, 2018.
+
+* [Bryan Newbold](https://bnewbold.net)
+
+* [Ellen Spertus](http://www.spertus.com/ellen/)
+
+  * IA Volunteer Summer 2018, wrote a Hadoop/Scala/Scalding matching job which resulted in millions of fatcat fulltext matches

diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 151b025d..a45b44f8 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -83,7 +83,7 @@ def main():
         help="Kafka topic namespace to use (eg, prod, qa, dev)")
     parser.add_argument('--start-date',
         default=None, type=mkdate,
-        help="begining of harvest period")
+        help="beginning of harvest period")
     parser.add_argument('--end-date',
         default=None, type=mkdate,
         help="end of harvest period")
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index e1e06653..331cf791 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -416,7 +416,7 @@ def main():
         help="whether postproc_status column must be '200'")
     sub_arabesque_match.add_argument('--extid-type',
         default="doi",
-        help="identifer type in the database (eg, 'doi', 'pmcid'")
+        help="identifier type in the database (eg, 'doi', 'pmcid'")
     sub_arabesque_match.add_argument('--crawl-id',
         help="crawl ID (optionally included in editgroup metadata)")
     sub_arabesque_match.add_argument('--default-link-rel',
@@ -424,7 +424,7 @@ def main():
         help="default URL rel for matches (eg, 'publisher', 'web')")

     sub_ingest_file = subparsers.add_parser('ingest-file-results',
-        help="add/update flie entities linked to releases based on sandcrawler ingest results")
+        help="add/update file entities linked to releases based on sandcrawler ingest results")
     sub_ingest_file.set_defaults(
         func=run_ingest_file,
         auth_var="FATCAT_AUTH_WORKER_CRAWL",
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 9ba95015..c6f27ad3 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -185,7 +185,7 @@ def main():
         help="list of Kafka brokers (host/port) to use")
     parser.add_argument('--elasticsearch-endpoint',
         default="https://search.fatcat.wiki",
-        help="elasticsearch API. internal endpoint prefered, but public is default")
+        help="elasticsearch API. internal endpoint preferred, but public is default")
     parser.add_argument('--env',
         default="dev",
         help="Kafka topic namespace to use (eg, prod, qa, dev)")
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index 310366bd..5f7aa084 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -133,7 +133,7 @@ class HarvestState:
             def fail_fast(err, msg):
                 if err:
                     raise KafkaException(err)
-            print("Commiting status to Kafka: {}".format(kafka_topic), file=sys.stderr)
+            print("Committing status to Kafka: {}".format(kafka_topic), file=sys.stderr)
             producer_conf = kafka_config.copy()
             producer_conf.update({
                 'delivery.report.only.error': True,
@@ -164,7 +164,7 @@ class HarvestState:
                 raise KafkaException(err)
         conf = kafka_config.copy()
         conf.update({
-            'group.id': 'dummy_init_group', # should never be commited
+            'group.id': 'dummy_init_group', # should never be committed
             'enable.auto.commit': False,
             'auto.offset.reset': 'earliest',
             'session.timeout.ms': 10000,
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index c000ad62..da611ecb 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -816,7 +816,7 @@ class KafkaJsonPusher(RecordPusher):
         while True:
             # Note: this is batch-oriented, because underlying importer is
             # often batch-oriented, but this doesn't confirm that entire batch
-            # has been pushed to fatcat before commiting offset. Eg, consider
+            # has been pushed to fatcat before committing offset. Eg, consider
             # case where there there is one update and thousands of creates;
             # update would be lingering in importer, and if importer crashed
             # never created.
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index bd070ef1..9617299c 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -9,7 +9,7 @@ import fatcat_openapi_client

 from .common import EntityImporter, clean

-# The docs/guide should be the cannonical home for these mappings; update there
+# The docs/guide should be the canonical home for these mappings; update there
 # first
 # Can get a list of Crossref types (with counts) via API:
 # https://api.crossref.org/works?rows=0&facet=type-name:*
@@ -188,7 +188,7 @@ class CrossrefImporter(EntityImporter):
             self.counts['skip-release-type'] += 1
             return None

-        # Do require the 'title' keys to exsit, as release entities do
+        # Do require the 'title' keys to exist, as release entities do
         if (not 'title' in obj) or (not obj['title']):
             self.counts['skip-blank-title'] += 1
             return None
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 5b736787..81f00876 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -3,7 +3,7 @@ Prototype importer for datacite.org data.

 Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51

-Datacite being an aggregator, the data is heterogenous and exposes a couple of
+Datacite being an aggregator, the data is heterogeneous and exposes a couple of
 problems in content and structure. A few fields have their own parsing
 functions (parse_datacite_...), which may help testing.
 """
@@ -36,7 +36,7 @@ CONTAINER_TYPE_MAP = {
     'Book Series': 'book-series',
 }

-# The docs/guide should be the cannonical home for these mappings; update there
+# The docs/guide should be the canonical home for these mappings; update there
 # first.  Map various datacite type types to CSL-ish types. None means TODO or
 # remove.
 DATACITE_TYPE_MAP = {
@@ -228,7 +228,7 @@ class DataciteImporter(EntityImporter):

     def lookup_ext_ids(self, doi):
         """
-        Return dictionary of identifiers refering to the same things as the given DOI.
+        Return dictionary of identifiers referring to the same things as the given DOI.
         """
         if self.extid_map_db is None:
             return dict(core_id=None,
@@ -584,7 +584,7 @@ class DataciteImporter(EntityImporter):

         # Include certain relations from relatedIdentifiers. Keeping the
         # original structure of data here, which is a list of dicts, with
-        # relation type, identifer and identifier type (mostly).
+        # relation type, identifier and identifier type (mostly).
         relations = []
         for rel in relIds:
             if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py
index 7ab94cac..832ad6aa 100644
--- a/python/fatcat_tools/transforms/csl.py
+++ b/python/fatcat_tools/transforms/csl.py
@@ -37,8 +37,9 @@ def release_to_csl(entity):
             # Default to "local" (publication-specific) metadata; fall back to
             # creator-level
             family = contrib.surname or contrib.creator.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
-            if not contrib.raw_name:
-                raise ValueError("CSL requires some surname (family name)")
+            if not family:
+                # CSL requires some surname (family name)
+                continue
             c = dict(
                 family=family,
                 given=contrib.given_name or contrib.creator.given_name,
@@ -49,22 +50,27 @@ def release_to_csl(entity):
                 #static-ordering
                 literal=contrib.raw_name or contrib.creator.display_name,
                 #parse-names,
-                role=contrib.role,
+                # role must be defined; default to author
+                role=contrib.role or 'author',
             )
         else:
             family = contrib.surname or (contrib.raw_name and contrib.raw_name.split()[-1])
-            if not contrib.raw_name:
-                raise ValueError("CSL requires some surname (family name)")
+            if not family:
+                # CSL requires some surname (family name)
+                continue
             c = dict(
                 family=family,
                 given=contrib.given_name,
                 literal=contrib.raw_name,
-                role=contrib.role,
+                # role must be defined; default to author
+                role=contrib.role or 'author',
             )
         for k in list(c.keys()):
             if not c[k]:
                 c.pop(k)
         contribs.append(c)
+    if not contribs:
+        raise ValueError("citeproc requires at least one author with a surname")
     abstract = None
     if entity.abstracts:
         abstract = entity.abstracts[0].content
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 5783bbfc..d1e7c2db 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -110,6 +110,32 @@ class EntityUpdatesWorker(FatcatWorker):
             # the lancet (often hybrid OA)
             "10.1016/s0140-6736",
             "10.1016/s2213-2600",
+            # journal of virology
+            "10.1128/jvi.",
+            # FEBS letters
+            "10.1002/1873-3468.",
+            # Journal of Neuroscience
+            "10.1523/jneurosci.",
+            # Chemical and pharmaceutical bulletin
+            "10.1248/cpb.",
+            # Japanese Journal of Radiological Technology
+            "10.6009/jjrt.",
+            # Seibutsu Butsuri
+            "10.2142/biophys.",
+            # Chemical Communications
+            "10.1039/d0cc",
+            # Yakugaku zasshi
+            "10.1248/yakushi.",
+            # bulletin AMS
+            "10.1090/s0002-9904",
+            # Current Biology
+            "10.1016/j.cub.",
+            # Antarctica A Keystone in a Changing World
+            "10.3133/ofr",
+            # Clinical Cancer Research
+            "10.1158/1078-0432.",
+            # Transactions of the Japan Society of Mechanical Engineers
+            "10.1299/kikai",
         ]

     def want_live_ingest(self, release, ingest_request):
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
index 23a56109..14595670 100755
--- a/python/fatcat_transform.py
+++ b/python/fatcat_transform.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3

 """
-Utility script for doing bulk conversion/tranforms of entity JSON schema to
+Utility script for doing bulk conversion/transforms of entity JSON schema to
 other formats
 """

diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py
index 50757858..56a2e020 100644
--- a/python/fatcat_web/__init__.py
+++ b/python/fatcat_web/__init__.py
@@ -61,7 +61,7 @@ else:
     print("No privileged token found")
     priv_api = None

-# TODO: refactor integration so this doesn't always need to be definied. If
+# TODO: refactor integration so this doesn't always need to be defined. If
 # key/secret are empty, library will not init; if init is skipped, get
 # undefined errors elsewhere.
 mwoauth = MWOAuth(
diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py
index 7a9830f9..7ac0f155 100644
--- a/python/fatcat_web/entity_helpers.py
+++ b/python/fatcat_web/entity_helpers.py
@@ -1,6 +1,6 @@
 from flask import abort
-from fatcat_openapi_client.rest import ApiException
+from fatcat_openapi_client.rest import ApiException, ApiValueError
 from fatcat_tools.transforms import *
 from fatcat_web import app, api
 from fatcat_web.search import get_elastic_container_stats, get_elastic_container_random_releases
@@ -78,8 +78,13 @@ def enrich_release_entity(entity):
             entity.subtitle = entity.extra['subtitle']
     # author list to display; ensure it's sorted by index (any othors with
     # index=None go to end of list)
-    authors = [c for c in entity.contribs if c.role in ('author', None)]
+    authors = [c for c in entity.contribs if
+        c.role in ('author', None) and
+        (c.surname or c.raw_name or (c.creator and c.creator.surname))
+    ]
     entity._authors = sorted(authors, key=lambda c: (c.index == None and 99999999) or c.index)
+    # need authors, title for citeproc to work
+    entity._can_citeproc = bool(entity._authors) and bool(entity.title)
     if entity.abstracts:
         # hack to show plain text instead of latex abstracts
         if 'latex' in entity.abstracts[0].mimetype:
@@ -122,6 +127,8 @@ def generic_get_entity(entity_type, ident):
             raise NotImplementedError
     except ApiException as ae:
         abort(ae.status)
+    except ApiValueError:
+        abort(400)

 def generic_get_entity_revision(entity_type, revision_id):
     try:
@@ -143,6 +150,8 @@ def generic_get_entity_revision(entity_type, revision_id):
             raise NotImplementedError
     except ApiException as ae:
         abort(ae.status)
+    except ApiValueError:
+        abort(400)

 def generic_get_editgroup_entity(editgroup, entity_type, ident):
     if entity_type == 'container':
@@ -168,9 +177,15 @@ def generic_get_editgroup_entity(editgroup, entity_type, ident):
             edit = e
             break
     if not revision_id:
-        # couldn't find relevent edit in this editgroup
+        # couldn't find relevant edit in this editgroup
         abort(404)
-    entity = generic_get_entity_revision(entity_type, revision_id)
+    try:
+        entity = generic_get_entity_revision(entity_type, revision_id)
+    except ApiException as ae:
+        abort(ae.status)
+    except ApiValueError:
+        abort(400)
+
     entity.ident = ident
     return entity, edit

diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 6b2b9cc1..c1246d22 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -299,7 +299,7 @@ def get_elastic_container_histogram(ident):
     """
     Fetches a stacked histogram of

-    Filters to the past 500 years (at most), or about 1000 vaules.
+    Filters to the past 500 years (at most), or about 1000 values.

     Returns a list of tuples:
         (year, in_ia, count)
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index 961b4759..d7c4e76e 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -388,8 +388,7 @@ accessible version.
   <br>grouping other versions (eg, pre-print) and variants of this release
 </div>

-{# this restriction, for CSL-JSON generation, rules out almost everything #}
-{% if release.contribs and release.contribs[0].creator_id %}
+{% if release._can_citeproc %}
 <div class="ui segment attached accordion">
   <div class="title" style="padding: 0px;">
     <i class="dropdown icon"></i><b>Cite This Release</b>
diff --git a/python/tests/transform_csl.py b/python/tests/transform_csl.py
index 6f29cba7..15c64ce5 100644
--- a/python/tests/transform_csl.py
+++ b/python/tests/transform_csl.py
@@ -12,22 +12,22 @@ def test_csl_crossref(crossref_importer):
         # not a single line
         raw = json.loads(f.read())
         r = crossref_importer.parse_record(raw)
-    # this work has some null contrib names; these should cause errors
-    with pytest.raises(ValueError):
-        release_to_csl(r)
-    with pytest.raises(ValueError):
-        csl = release_to_csl(r)
-        citeproc_csl(csl, 'csl-json')
-    # set with dummy so we can run other tests
-    for c in r.contribs:
-        if not c.raw_name:
-            c.raw_name = "dummy"
     csl = release_to_csl(r)
     citeproc_csl(csl, 'csl-json')
     citeproc_csl(csl, 'bibtex')
     citeproc_csl(csl, 'harvard1')
     citeproc_csl(csl, 'harvard1', html=True)
+    # check that with no author surnames, can't run
+    for c in r.contribs:
+        c.raw_name = None
+        c.surname = None
+    with pytest.raises(ValueError):
+        release_to_csl(r)
+    with pytest.raises(ValueError):
+        csl = release_to_csl(r)
+        citeproc_csl(csl, 'csl-json')
+

 def test_csl_pubmed(crossref_importer):
     with open('tests/files/example_releases_pubmed19n0972.json', 'r') as f:
         # multiple single lines
diff --git a/python/tests/web_citation_csl.py b/python/tests/web_citation_csl.py
index 3279ebea..e016b2d9 100644
--- a/python/tests/web_citation_csl.py
+++ b/python/tests/web_citation_csl.py
@@ -6,7 +6,7 @@ from fatcat_openapi_client.rest import ApiException

 from fixtures import *

-def test_release_bibtex(app):
+def test_release_bibtex(app, api):

     # "realistic" demo entity
     rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam')
@@ -17,6 +17,8 @@ def test_release_bibtex(app):
     assert b'@article{' in rv.data
     rv = app.get('/release/ccccccccccccccccccccccccca.bib')
     assert rv.status_code == 404
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=bibtex')
+    assert rv.status_code == 200
     rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=csl-json')
     assert rv.status_code == 200
     # could also rv.get_json() here
@@ -25,10 +27,48 @@ def test_release_bibtex(app):
     assert rv.status_code == 200
     assert rv.data.decode('utf-8').startswith('Ioannidis, John. “Why Most Published Research Findings Are False”. 2.8 (2005)')

-    # "dummy" demo entity
+    # "dummy" demo entity; very minimal metadata
     rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai')
     assert rv.status_code == 200
+    assert b'BibTeX' in rv.data
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+    assert rv.status_code == 200
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=modern-language-association')
+    assert rv.status_code == 200
+    rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai/citeproc?style=csl-json')
+    assert rv.status_code == 200
+
+    # create release which can not have citeproc run on it (no authors)
+    eg = quick_eg(api)
+    r1 = ReleaseEntity(
+        title="some title",
+        ext_ids=ReleaseExtIds(),
+    )
+    r1edit = api.create_release(eg.editgroup_id, r1)
+    api.accept_editgroup(eg.editgroup_id)
+
+    rv = app.get('/release/{}'.format(r1edit.ident))
+    assert rv.status_code == 200
     assert not b'BibTeX' in rv.data
     with pytest.raises(ValueError):
-        rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai.bib')
+        rv = app.get('/release/{}.bib'.format(r1edit.ident))
+
+    # create release can have citeproc run on it (no authors)
+    eg = quick_eg(api)
+    r2 = ReleaseEntity(
+        title="some title again",
+        contribs=[
+            ReleaseContrib(
+                given_name="Paul",
+                surname="Otlet"),
+        ],
+        ext_ids=ReleaseExtIds(),
+    )
+    r2edit = api.create_release(eg.editgroup_id, r2)
+    api.accept_editgroup(eg.editgroup_id)
+    rv = app.get('/release/{}'.format(r2edit.ident))
+    assert rv.status_code == 200
+    assert b'BibTeX' in rv.data
+    rv = app.get('/release/{}.bib'.format(r2edit.ident))
+    assert rv.status_code == 200

diff --git a/python/tests/web_entity_views.py b/python/tests/web_entity_views.py
index 23a2b33b..a3f0f897 100644
--- a/python/tests/web_entity_views.py
+++ b/python/tests/web_entity_views.py
@@ -42,6 +42,8 @@ def test_entity_basics(app):
         assert rv.status_code == 200
         rv = app.get('/{}/rev/{}'.format(entity_type, revision))
         assert rv.status_code == 200
+        rv = app.get('/{}/rev/{}_something'.format(entity_type, revision))
+        assert rv.status_code == 400
         rv = app.get('/{}/rev/{}/metadata'.format(entity_type, revision))
         assert rv.status_code == 200
         print('/editgroup/aaaaaaaaaaaabo53aaaaaaaaaq/{}/{}'.format(entity_type, ident))
@@ -63,7 +65,7 @@ def test_entity_basics(app):
         # TODO: redirects and deleted entities

 def test_web_deleted_release(app, api):
-    # specific regresion test for view of a deleted release
+    # specific regression test for view of a deleted release

     # create release
     eg = quick_eg(api)
diff --git a/python/tests/web_search.py b/python/tests/web_search.py
index 19e2c29f..24b817dc 100644
--- a/python/tests/web_search.py
+++ b/python/tests/web_search.py
@@ -75,7 +75,7 @@ def test_stats(app):
         json=elastic_resp3.copy(), status=200)
     rv = app.get('/stats')
     assert rv.status_code == 200
-    # TODO: probe these reponses better
+    # TODO: probe these responses better

 @responses.activate
 def test_stats_json(app):
@@ -112,7 +112,7 @@ def test_container_stats(app):
         json=elastic_resp, status=200)
     rv = app.get('/container/issnl/1234-5678/stats.json')
     assert rv.status_code == 200
-    # TODO: probe this reponse better
+    # TODO: probe this response better

 # TODO: container stats
 # TODO: container ISSN-L query
