| author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-18 20:21:49 +0100 |
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-28 23:07:31 +0100 |
| commit | 403b1a2d4591d878145a021a7c1e15e2d60c47d8 (patch) | |
| tree | 082ddd601a58b25be4ee176fdda97f935e23ea4b /python | |
| parent | 76d6d4d2de6580ae147e40c43c18f04cc48b62ec (diff) | |
| download | fatcat-403b1a2d4591d878145a021a7c1e15e2d60c47d8.tar.gz fatcat-403b1a2d4591d878145a021a7c1e15e2d60c47d8.zip | |
improve datacite field mapping and import
The current version succeeded in importing a random sample of 100,000
records (0.5%) from DataCite.
The --debug (write JSON to stdout) and --insert-log-file (log each
batch before committing it to the database) flags are temporarily
added to help with debugging.
Add a few unit tests.
Some edge cases:
a) Existing keys without a value require a slightly awkward construct:
```
titles = attributes.get('titles', []) or []
```
b) There can be 0, 1, or more titles (the first suitable one wins).
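Both (a) and (b) can be reproduced with a small standalone snippet. The
attrs_* dicts below are made up for illustration; the actual logic lives in
parse_record in datacite.py:
```
# Hypothetical DataCite attribute dicts, for illustration only.
attrs_null = {"titles": None}

# dict.get returns the stored None, not the default, hence the extra `or []`.
titles = attrs_null.get('titles', []) or []   # -> []

attrs_many = {"titles": [
    {"title": "Main title"},
    {"title": "Another title"},
    {"title": "A subtitle", "titleType": "Subtitle"},
]}

title, subtitle = None, None
for entry in attrs_many.get('titles', []) or []:
    # First entry without a titleType wins as the main title.
    if not title and not entry.get('titleType'):
        title = entry.get('title', '').strip()
    if entry.get('titleType') == 'Subtitle':
        subtitle = entry.get('title', '').strip()

print(title, '/', subtitle)   # Main title / A subtitle
```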
c) Date handling is probably not ideal yet. DataCite allows a potentially
fine-grained, typed list of dates (a condensed sketch of the parsing
approach follows the example below).
The test case (tests/files/datacite_sample.jsonl) refers to
https://ssl.fao.org/glis/doi/10.18730/8DYM9, which has 1986 as its main
date descriptor. The DataCite record contains: 2017 (publicationYear,
probably the year the record was created in the reference system),
1978-06-03 ("Collected", e.g. the experimental sample) and 1986
("Accepted"). The online version of the resource even knows one more
date (2019-06-05 10:14:43, by WIEWS update).
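For reference, a condensed sketch of the date handling, pulled out into a
hypothetical helper (the importer inlines this in parse_record): common
strptime patterns are tried first, the slower dateparser only as a
fallback, and "Issued"/"Collected" are deliberately left out of the
priority list.
```
import datetime

import dateparser  # third-party; only used as a slow fallback

DATE_TYPE_PRIO = ('Valid', 'Available', 'Accepted', 'Submitted',
                  'Copyrighted', 'Created', 'Updated')
COMMON_PATTERNS = ('%Y-%m-%d', '%Y', '%Y-%m',
                   '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S')

def parse_datacite_dates(dates):
    """Return (release_date, release_year) from a DataCite 'dates' list."""
    release_date, release_year = None, None
    for prio in DATE_TYPE_PRIO:
        for item in dates:
            if item.get('dateType') != prio:
                continue
            value, result, year_only = item.get('date', ''), None, False
            # Try cheap, common patterns before falling back to dateparser.
            for pattern in COMMON_PATTERNS:
                try:
                    result = datetime.datetime.strptime(value, pattern)
                except ValueError:
                    continue
                year_only = pattern == '%Y'
                break
            if result is None:
                result = dateparser.parse(value)
            if result is None:
                continue
            if not year_only:
                release_date = result.date()
            release_year = result.year
            if 1000 < release_year < datetime.date.today().year + 5:
                # Plausible date of the highest-priority type found.
                return release_date, release_year
    return release_date, release_year

# The dates from tests/files/datacite_sample.jsonl:
dates = [{"date": "1986", "dateType": "Accepted"},
         {"date": "1978-06-03", "dateType": "Collected"},
         {"date": "2017", "dateType": "Issued"}]
print(parse_datacite_dates(dates))  # (None, 1986): only "Accepted" is considered
```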
Diffstat (limited to 'python')
| -rwxr-xr-x | python/fatcat_import.py | 15 |
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 180 |
| -rw-r--r-- | python/tests/files/datacite_1k_records.jsonl.gz | bin 0 -> 684605 bytes |
| -rw-r--r-- | python/tests/files/datacite_sample.jsonl | 1 |
| -rw-r--r-- | python/tests/import_datacite.py | 108 |

5 files changed, 245 insertions, 59 deletions
| diff --git a/python/fatcat_import.py b/python/fatcat_import.py index d7651792..90bb01a1 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -170,7 +170,10 @@ def run_datacite(args):      dci = DataciteImporter(args.api,          args.issn_map_file,          edit_batch_size=args.batch_size, -        bezerk_mode=args.bezerk_mode) +        bezerk_mode=args.bezerk_mode, +        debug=args.debug, +        lang_detect=args.lang_detect, +        insert_log_file=args.insert_log_file)      if args.kafka_mode:          KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite",              "fatcat-import", consume_batch_size=args.batch_size).run() @@ -464,6 +467,16 @@ def main():      sub_datacite.add_argument('--bezerk-mode',          action='store_true',          help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") +    sub_datacite.add_argument('--debug', +        action='store_true', +        help="write converted JSON to stdout") +    sub_datacite.add_argument('--lang-detect', +        action='store_true', +        help="try to detect language (slow)") +    sub_datacite.add_argument('--insert-log-file', +        default='', +        type=str, +        help="write inserted documents into file (for debugging)")      sub_datacite.set_defaults(          func=run_datacite,          auth_var="FATCAT_API_AUTH_TOKEN", diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4e117dde..9774e334 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -6,13 +6,14 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8  from .common import EntityImporter  import dateparser -import langcodes  import datetime -import langdetect  import fatcat_openapi_client +import hashlib  import json +import langcodes +import langdetect +import sqlite3  import sys -import hashlib  # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary  CONTAINER_TYPE_MAP = { @@ -147,10 +148,11 @@ LICENSE_SLUG_MAP = {  class DataciteImporter(EntityImporter):      """ -    Importer for datacite records. TODO(martin): Do we need issn_map_file? +    Importer for datacite records.      
""" -    def __init__(self, api, issn_map_file, **kwargs): +    def __init__(self, api, issn_map_file, debug=False, lang_detect=False, +                 insert_log_file=None, **kwargs):          eg_desc = kwargs.get('editgroup_description',              "Automated import of Datacite DOI metadata, harvested from REST API") @@ -163,7 +165,42 @@ class DataciteImporter(EntityImporter):              **kwargs)          self.create_containers = kwargs.get('create_containers', True) +        extid_map_file = kwargs.get('extid_map_file') +        self.extid_map_db = None +        if extid_map_file: +            db_uri = "file:{}?mode=ro".format(extid_map_file) +            print("Using external ID map: {}".format(db_uri), file=sys.stderr) +            self.extid_map_db = sqlite3.connect(db_uri, uri=True) +        else: +            print("Not using external ID map", file=sys.stderr) +          self.read_issn_map_file(issn_map_file) +        self.debug = debug +        self.lang_detect = lang_detect +        self.insert_log_file = insert_log_file + +        print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr) + +    def lookup_ext_ids(self, doi): +        """ +        Return dictionary of identifiers refering to the same things as the given DOI. +        """ +        if self.extid_map_db is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) +        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", +            [doi.lower()]).fetchone() +        if row is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) +        row = [str(cell or '') or None for cell in row] +        return dict( +            core_id=row[0], +            pmid=row[1], +            pmcid=row[2], +            wikidata_qid=row[3], +            # TODO: +            arxiv_id=None, +            jstor_id=None, +        )      def parse_record(self, obj):          """ @@ -174,14 +211,14 @@ class DataciteImporter(EntityImporter):          attributes = obj['attributes'] -        # Contributors. Many nameIdentifierSchemes, we do not use yet: -        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ -        # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS", -        # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ], +        # Contributors. Many nameIdentifierSchemes, we do not use (yet): +        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": +        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", +        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].          contribs = []          for i, c in enumerate(attributes['creators']): -            if not c.get('nameType') == 'Personal': +            if 'nameType' in c and not c.get('nameType') == 'Personal':                  continue              creator_id = None              for nid in c.get('nameIdentifiers', []): @@ -191,7 +228,7 @@ class DataciteImporter(EntityImporter):                  if not orcid:                      continue                  creator_id = self.lookup_orcid(orcid) -                # If creator_id is None, should we create creators? +                # TODO(martin): If creator_id is None, should we create creators?              
contribs.append(fatcat_openapi_client.ReleaseContrib(                  creator_id=creator_id,                  index=i, @@ -204,11 +241,27 @@ class DataciteImporter(EntityImporter):          # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"          title, subtitle = None, None -        for entry in attributes.get('titles', []): -            if not title and 'titleType' not in entry: -                title = entry.get('title').strip() -            if entry.get('titleType') == 'Subtitle': -                subtitle = entry.get('title').strip() +        titles = attributes.get('titles', []) or [] +        if len(titles) == 0: +            print('skipping record w/o title: {}'.format(obj), file=sys.stderr) +            return False +        elif len(titles) == 1: +            # We do not care about the type then. +            title = titles[0].get('title', '') or '' +            title = title.strip() +        else: +            for entry in titles: +                if not title and ('titleType' not in entry or not entry.get('titleType')): +                    title = entry.get('title').strip() +                if entry.get('titleType') == 'Subtitle': +                    subtitle = entry.get('title', '').strip() + +        if not title: +            print('skipping record w/o title: {}'.format(obj), file=sys.stderr) +            return False + +        if not subtitle: +            subtitle = None          # Dates. A few internal dates (registered, created, updated) and          # published (0..2554). We try to work with typed date list, in @@ -217,14 +270,13 @@ class DataciteImporter(EntityImporter):          # "Updated", "Valid".          release_year, release_date = None, None +        # Ignore: Collected, Issued.          date_type_prio = (              'Valid', -            'Issued',              'Available',              'Accepted',              'Submitted',              'Copyrighted', -            'Collected',              'Created',              'Updated',          ) @@ -233,15 +285,36 @@ class DataciteImporter(EntityImporter):              for item in dates:                  if not item.get('dateType') == prio:                      continue -                try: -                    result = dateparser.parse(item.get('date')) -                except TypeError as err: -                    print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr) -                    continue + +                # Parse out date, use common patterns first, fallback to dateparser. +                result, value, year_only = None, item.get('date', ''), False + +                # Before using (expensive) dateparser, try a few common patterns. 
+                common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S') + +                for pattern in common_patterns: +                    try: +                        result = datetime.datetime.strptime(value, pattern) +                    except ValueError: +                        continue +                    else: +                        if pattern == '%Y': +                            year_only = True +                        break + +                if result is None: +                    print('fallback for {}'.format(value), file=sys.stderr) +                    try: +                        result = dateparser.parse(value) +                    except TypeError as err: +                        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) +                        continue +                  if result is None:                      # Unparsable date.                      continue -                release_date = result +                if not year_only: +                    release_date = result.date()                  release_year = result.year                  if 1000 < release_year < datetime.date.today().year + 5:                      # Skip possibly bogus dates. @@ -280,10 +353,16 @@ class DataciteImporter(EntityImporter):                      container_id = self.lookup_issnl(issnl)                      if container_id is None and container.get('title'): +                        container_title = container.get('title') +                        if isinstance(container_title, list): +                            if len(container_title) > 0: +                                print('too many container titles: {}'.format(len(container_title))) +                                container_title = container_title[0] +                        assert isinstance(container_title, str)                          ce = fatcat_openapi_client.ContainerEntity(                              issnl=issnl,                              container_type=container_type, -                            name=container.get('title'), +                            name=container_title,                          )                          ce_edit = self.create_container(ce)                          container_id = ce_edit.ident @@ -326,12 +405,12 @@ class DataciteImporter(EntityImporter):          # closest, but not always supplied.          for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):              value = attributes.get('types', {}).get(typeType) -            release_type = DATACITE_TYPE_MAP.get(value) +            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)              if release_type is not None:                  break          if release_type is None: -            print("datacite unmapped type: {}".format(release_type), file=sys.stderr) +            print("no mapped type: {}".format(value), file=sys.stderr)          # Language values are varied ("ger", "es", "English", "ENG", "en-us",          # "other", ...). 
Try to crush it with langcodes: "It may sound to you @@ -347,7 +426,7 @@ class DataciteImporter(EntityImporter):              try:                  language = langcodes.get(value).language              except langcodes.tag_parser.LanguageTagError: -                print('could not determine language: {}'.format(value), file=sys.stderr) +                pass          # Abstracts appear in "attributes.descriptions[].descriptionType", some          # of the observed values: "Methods", "TechnicalInfo", @@ -355,8 +434,8 @@ class DataciteImporter(EntityImporter):          # "Other" fields might contain references or related articles (with          # DOI). TODO(martin): maybe try to parse out some of those refs.          abstracts = [] - -        for desc in attributes.get('descriptions', []): +        descs = attributes.get('descriptions', []) or [] +        for desc in descs:              if not desc.get('descriptionType') == 'Abstract':                  continue              if len(desc.get('description', '')) < 10: @@ -364,10 +443,11 @@ class DataciteImporter(EntityImporter):              text = desc.get('description')              sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()              lang = None -            try: -                lang = langdetect.detect(text) -            except langdetect.lang_detect_exception.LangDetectException: -                pass +            if self.lang_detect: +                try: +                    lang = langdetect.detect(text) +                except langdetect.lang_detect_exception.LangDetectException as err: +                    print('language detection failed: {}'.format(err), file=sys.stderr)              abstracts.append(fatcat_openapi_client.ReleaseAbstract(                  mimetype="text/plain",                  content=text, @@ -386,7 +466,8 @@ class DataciteImporter(EntityImporter):          # For the moment, we only care about References.          refs, ref_index = [], 0 -        for rel in attributes.get('relatedIdentifiers', []): +        relIds = attributes.get('relatedIdentifiers', []) or [] +        for rel in relIds:              if not rel.get('relationType') == 'References':                  continue              ref_extra = dict() @@ -422,6 +503,9 @@ class DataciteImporter(EntityImporter):          if extra_datacite:              extra['datacite'] = extra_datacite +        doi = attributes.get('doi', '').lower() +        extids = self.lookup_ext_ids(doi=doi) +          # Assemble release.          re = fatcat_openapi_client.ReleaseEntity(              work_id=None, @@ -435,7 +519,13 @@ class DataciteImporter(EntityImporter):              release_date=release_date,              publisher=publisher,              ext_ids=fatcat_openapi_client.ReleaseExtIds( -                doi=attributes.get('doi'), +                doi=doi, +                pmid=extids['pmid'], +                pmcid=extids['pmcid'], +                wikidata_qid=extids['wikidata_qid'], +                core=extids['core_id'], +                arxiv=extids['arxiv_id'], +                jstor=extids['jstor_id'],              ),              contribs=contribs,              volume=volume, @@ -449,11 +539,12 @@ class DataciteImporter(EntityImporter):          )          return re -    def try_update(self, re, debug=True): +    def try_update(self, re):          """ -        When debug is true, write the RE to stdout. +        When debug is true, write the RE to stdout, not to the database. Might +        hide schema mismatch bugs.          
""" -        if debug is True: +        if self.debug is True:              print(json.dumps(re.to_dict(), default=extended_json_encoder))              return False @@ -476,10 +567,16 @@ class DataciteImporter(EntityImporter):          return True      def insert_batch(self, batch): +        print('inserting batch ({})'.format(len(batch)), file=sys.stderr) +        if self.insert_log_file: +            with open(self.insert_log_file, 'a') as f: +                for doc in batch: +                    json.dump(doc.to_dict(), f, default=extended_json_encoder) +                    f.write('\n')          self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(              editgroup=fatcat_openapi_client.Editgroup( -                description=self.editgroup_description, -                extra=self.editgroup_extra), +            description=self.editgroup_description, +            extra=self.editgroup_extra),              entity_list=batch))  def extended_json_encoder(value): @@ -491,6 +588,7 @@ def extended_json_encoder(value):          return value.isoformat()      if isinstance(value, set):          return list(value) +    raise TypeError('cannot encode type: {}'.format(type(value)))  def lookup_license_slug(raw):      """ diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gzBinary files differ new file mode 100644 index 00000000..28ea6e37 --- /dev/null +++ b/python/tests/files/datacite_1k_records.jsonl.gz diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl new file mode 100644 index 00000000..dba3e267 --- /dev/null +++ b/python/tests/files/datacite_sample.jsonl @@ -0,0 +1 @@ +{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.<br>Taxonomy: Triticum turgidum L. subsp. durum (Desf.) 
Husn.<br>Common name(s): Wheat<br>Conserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon<br>Local sample unique identifier: 97090<br>Method of creation: Acquisition<br>Date: 1986<br>Biological status: Traditional cultivar/landrace<br>Other identifiers: ICDW 20791<br>MLS status: Included<br>Historical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 0bbaba2e..9c542fc6 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -1,25 +1,99 @@  """  Test datacite importer. +""" -Datacite is a aggregator, hence inputs are quite varied. +import datetime +import pytest +import gzip +from fatcat_tools.importers import DataciteImporter, JsonLinePusher +from fixtures import api +import json -Here is small sample of ID types taken from a sample: -    497344 "DOI" -     65013 "URL" -     22210 "CCDC" -     17853 "GBIF" -     17635 "Other" -     11474 "uri" -      9170 "Publisher ID" -      7775 "URN" -      6196 "DUCHAS" -      5624 "Handle" -      5056 "publisherId" +@pytest.fixture(scope="function") +def datacite_importer(api): +    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: +        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', +                               bezerk_mode=True) -A nice tool, not yet existing tool (maybe named indigo) would do the following: +@pytest.fixture(scope="function") +def datacite_importer_existing(api): +    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: +        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', +                               bezerk_mode=False) -    $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md -TODO(martin): Write tests. 
-""" +@pytest.mark.skip(reason="larger datacite import slows tests down") +def test_datacite_importer_huge(datacite_importer): +    last_index = datacite_importer.api.get_changelog(limit=1)[0].index +    with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: +        datacite_importer.bezerk_mode = True +        counts = JsonLinePusher(datacite_importer, f).run() +    assert counts['insert'] == 998 +    change = datacite_importer.api.get_changelog_entry(index=last_index+1) +    release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) +    assert len(release.contribs) == 3 + + +def test_datacite_importer(datacite_importer): +    last_index = datacite_importer.api.get_changelog(limit=1)[0].index +    with open('tests/files/datacite_sample.jsonl', 'r') as f: +        datacite_importer.bezerk_mode = True +        counts = JsonLinePusher(datacite_importer, f).run() +    assert counts['insert'] == 1 +    assert counts['exists'] == 0 +    assert counts['skip'] == 0 + +    # fetch most recent editgroup +    change = datacite_importer.api.get_changelog_entry(index=last_index+1) +    eg = change.editgroup +    assert eg.description +    assert "datacite" in eg.description.lower() +    assert eg.extra['git_rev'] +    assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + +    last_index = datacite_importer.api.get_changelog(limit=1)[0].index +    with open('tests/files/datacite_sample.jsonl', 'r') as f: +        datacite_importer.bezerk_mode = False +        datacite_importer.reset() +        counts = JsonLinePusher(datacite_importer, f).run() +    assert counts['insert'] == 0 +    assert counts['exists'] == 1 +    assert counts['skip'] == 0 +    assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + +def test_datacite_dict_parse(datacite_importer): +    with open('tests/files/datacite_sample.jsonl', 'r') as f: +        raw = json.load(f) +        r = datacite_importer.parse_record(raw) +        # ensure the API server is ok with format +        JsonLinePusher(datacite_importer, [json.dumps(raw)]).run() + +        print(r.extra) +        assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" +        assert r.publisher == "International Centre for Agricultural Research in Dry Areas" +        assert r.release_type == "article" +        assert r.release_stage == "published" +        assert r.license_slug == None +        assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 
97090" +        assert r.ext_ids.doi == "10.18730/8dym9" +        assert r.ext_ids.isbn13 == None +        assert r.language == "enc" +        assert r.subtitle == None +        assert r.release_date == None +        assert r.release_year == 1986 +        assert 'subtitle' not in r.extra +        assert 'subtitle' not in r.extra['datacite'] +        assert 'funder' not in r.extra +        assert 'funder' not in r.extra['datacite'] +        # matched by ISSN, so shouldn't be in there +        #assert extra['container_name'] == "International Journal of Quantum Chemistry" +        assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9' +        assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] +        assert len(r.abstracts) == 1 +        assert len(r.abstracts[0].content) == 421 +        assert len(r.contribs) == 1 +        assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" +        assert r.contribs[0].given_name == None +        assert r.contribs[0].surname == None +        assert len(r.refs) == 0 | 
