From c613297265f101e31fe915ac74bc80b43f6ffe00 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 25 Jul 2021 15:09:27 -0700 Subject: refs transform: many fixes - include year correctly (many cases) - test coverage for Crossref transform - pass-through 'edition' as 'version' - series-title parsed in to title or container as appropriate - missing release stage - fix 0-index vs. 1-index ref index field --- fatcat_scholar/transform.py | 43 ++++-- tests/files/example_crossref_record.json | 225 +++++++++++++++++++++++++++++++ tests/test_refs_transform.py | 50 ++++++- 3 files changed, 308 insertions(+), 10 deletions(-) create mode 100644 tests/files/example_crossref_record.json diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 22e2e8f..508b898 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -640,7 +640,7 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur pmid=ref.get("pmid"), pmcid=clean_pmcid(ref.get("pmcid")), arxiv_id=ref.get("arxiv_id"), - # isbn13: Optional[str] + isbn=ref.get("isbn"), url=clean_url_conservative(ref.get("url")), ), release_ident=release.ident, @@ -701,11 +701,12 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: pmid=extra.get("pmid"), pmcid=extra.get("pmcid"), arxiv_id=extra.get("arxiv_id"), - isbn13=extra.get("isbn13"), + isbn=extra.get("isbn13") or extra.get("isbn"), url=clean_url_conservative(extra.get("url")), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, index=ref_index, key=key or None, @@ -738,20 +739,41 @@ def refs_from_crossref( key = key.replace(record["DOI"], "") if key and key.startswith("ref-"): key = key[4:] + ref_title = ref.get("article-title") ref_container_name = ref.get("journal-title") if not ref_container_name: + ref_container_name = ref.get("container-title") + + # volume-title is often a book title + if not ref_title: + ref_title = ref.get("volume-title") + elif not ref_container_name: ref_container_name = ref.get("volume-title") + + # series-title is a bit weird in Crossref references. it is often + # passed alone and seems to be the article/book title miscategorized. + # other times it is a conference name. + series_title = ref.get("series-title") + if not ref_title: + ref_title = series_title + elif not ref_container_name: + ref_container_name = series_title + + year = ref.get("year") + if year and year.isdigit(): + year = int(year) + else: + year = None date = ref.get("date") - year = None - if date and len(date) >= 4 and date[:4].isdigit(): + if date and not year and len(date) >= 4 and date[:4].isdigit(): year = int(date[:4]) - if year < 1000 or year > 2100: - year = None + if year and (year < 1000 or year > 2100): + year = None output.append( RefStructured( biblio=RefBiblio( unstructured=ref.get("unstructured"), - title=ref.get("article-title"), + title=ref_title, subtitle=ref.get("subtitle"), contrib_raw_names=authors, year=year, @@ -759,15 +781,18 @@ def refs_from_crossref( publisher=ref.get("publisher"), volume=ref.get("volume"), issue=ref.get("issue"), - pages=ref.get("page"), + pages=ref.get("first-page"), + version=ref.get("edition"), doi=ref.get("DOI"), + isbn=ref.get("ISBN"), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, index=i + 1, # 1-indexed key=key or None, - locator=ref.get("first-page"), + #locator, target_release_id=None, ref_source=ref_source, ) diff --git a/tests/files/example_crossref_record.json b/tests/files/example_crossref_record.json new file mode 100644 index 0000000..d87c7c2 --- /dev/null +++ b/tests/files/example_crossref_record.json @@ -0,0 +1,225 @@ +{ + "doi": "10.1515/jpm-2019-0016", + "record": +{ + "DOI": "10.1111/his.12200", + "ISSN": [ + "0309-0167" + ], + "URL": "http://dx.doi.org/10.1111/his.12200", + "author": [ + { + "affiliation": [], + "family": "Stewart", + "given": "Colin J R" + } + ], + "container-title": [ + "Histopathology" + ], + "content-domain": { + "crossmark-restriction": false, + "domain": [] + }, + "created": { + "date-parts": [ + [ + 2013, + 6, + 3 + ] + ], + "date-time": "2013-06-03T16:37:56Z", + "timestamp": 1370277476000 + }, + "deposited": { + "date-parts": [ + [ + 2017, + 6, + 21 + ] + ], + "date-time": "2017-06-21T14:04:36Z", + "timestamp": 1498053876000 + }, + "indexed": { + "date-parts": [ + [ + 2020, + 7, + 28 + ] + ], + "date-time": "2020-07-28T14:37:55Z", + "timestamp": 1595947075455 + }, + "is-referenced-by-count": 0, + "issn-type": [ + { + "type": "print", + "value": "0309-0167" + } + ], + "issued": { + "date-parts": [ + [ + 2013, + 7 + ] + ] + }, + "license": [ + { + "URL": "http://doi.wiley.com/10.1002/tdm_license_1.1", + "content-version": "tdm", + "delay-in-days": 792, + "start": { + "date-parts": [ + [ + 2015, + 9, + 1 + ] + ], + "date-time": "2015-09-01T00:00:00Z", + "timestamp": 1441065600000 + } + } + ], + "link": [ + { + "URL": "https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1111%2Fhis.12200", + "content-type": "unspecified", + "content-version": "vor", + "intended-application": "text-mining" + } + ], + "member": "311", + "original-title": [], + "page": "n/a-n/a", + "prefix": "10.1111", + "published-online": { + "date-parts": [ + [ + 2013, + 7, + 16 + ] + ] + }, + "published-print": { + "date-parts": [ + [ + 2013, + 7 + ] + ] + }, + "publisher": "Wiley", + "reference": [ + { + "DOI": "10.5858/arpa.2012-0112-RA", + "article-title": "The separation of benign and malignant mesothelial proliferations", + "author": "Churg", + "doi-asserted-by": "crossref", + "first-page": "1217", + "journal-title": "Arch. Pathol. Lab. Med.", + "key": "10.1111/his.12200-BIB0001|his12200-cit-0001", + "volume": "136", + "year": "2012" + }, + { + "DOI": "10.1136/jcp.2010.086074", + "article-title": "Peritoneal mesothelial hyperplasia associated with gynaecological disease: a potential diagnostic pitfall that is commonly associated with endometriosis", + "author": "Opraka", + "doi-asserted-by": "crossref", + "first-page": "313", + "journal-title": "J. Clin. Pathol.", + "key": "10.1111/his.12200-BIB0002|his12200-cit-0002", + "volume": "64", + "year": "2011" + }, + { + "DOI": "10.1038/modpathol.2012.105", + "article-title": "Deciduoid mesothelioma: report of 21 cases with review of the literature", + "author": "Ordonez", + "doi-asserted-by": "crossref", + "first-page": "1481", + "journal-title": "Mod. Pathol.", + "key": "10.1111/his.12200-BIB0003|his12200-cit-0003", + "volume": "25", + "year": "2012" + }, + { + "DOI": "10.1111/j.1525-1438.2006.00509.x", + "article-title": "Atypical reactive ovarian surface epithelium, a pitfall in pathologic assessment", + "author": "Aydin", + "doi-asserted-by": "crossref", + "first-page": "207", + "issue": "Suppl. 1", + "journal-title": "Int. J. Gynecol. Cancer", + "key": "10.1111/his.12200-BIB0004|his12200-cit-0004", + "volume": "16", + "year": "2006" + }, + { + "DOI": "10.1097/PAP.0b013e3180ca7d7b", + "article-title": "The pathology of endometriosis: a survey of the many faces of a common disease emphasizing diagnostic pitfalls and unusual and newly appreciated aspects", + "author": "Clement", + "doi-asserted-by": "crossref", + "first-page": "241", + "journal-title": "Adv. Anat. Pathol.", + "key": "10.1111/his.12200-BIB0005|his12200-cit-0005", + "volume": "14", + "year": "2007" + }, + { + "article-title": "Extramedullary hematopoiesis associated with organizing peritoneal hemorrhage: a report of 5 cases in patients presenting with primary gynecological disorders", + "author": "Mesbah Ardakani", + "journal-title": "Int. J. Gynecol. Pathol.", + "key": "10.1111/his.12200-BIB0006|his12200-cit-0006" + }, + { + "key": "10.1016/B0-12-227090-8/00204-9_bib5", + "series-title": "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference", + "year": "2001" + }, + { + "key": "CIT0041", + "unstructured": "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6." + }, + { + "author": "L Piegl", + "edition": "2", + "key": "576_CR3", + "unstructured": "Piegl L, Tiller W (1997) The NURBS Book, Monographs in Visual Communication, 2nd edn. Springer, Berlin", + "volume-title": "The NURBS Book, Monographs in Visual Communication", + "year": "1997" + } + ], + "reference-count": 6, + "references-count": 6, + "relation": { + "cites": [] + }, + "score": null, + "short-container-title": [ + "Histopathology" + ], + "short-title": [], + "source": "Crossref", + "subject": [ + "Pathology and Forensic Medicine", + "Histology", + "General Medicine" + ], + "subtitle": [], + "title": [ + "Deciduoid mesothelial hyperplasia of the pelvic peritoneum" + ], + "type": "journal-article" +}, + "release_ident": "arzkbn5brjf2nitdy4fkiusc4q" +} + diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py index 5b48396..7cd46e8 100644 --- a/tests/test_refs_transform.py +++ b/tests/test_refs_transform.py @@ -1,7 +1,8 @@ +import json from fatcat_openapi_client import ReleaseEntity from fatcat_scholar.grobid2json import teixml2json -from fatcat_scholar.transform import refs_from_grobid +from fatcat_scholar.transform import refs_from_grobid, refs_from_crossref def test_transform_refs_grobid() -> None: @@ -40,3 +41,50 @@ def test_transform_refs_grobid() -> None: ref.biblio.unstructured == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + +def test_transform_refs_crossref() -> None: + + with open("tests/files/example_crossref_record.json", "r") as f: + record = json.loads(f.read()) + + dummy_release = ReleaseEntity( + ident="releasedummy22222222222222", + work_id="workdummy22222222222222222", + release_year=1234, + release_stage="accepted", + ext_ids={}, + ) + + refs = refs_from_crossref(dummy_release, record) + + assert refs[0].release_ident == "releasedummy22222222222222" + assert refs[0].work_ident == "workdummy22222222222222222" + assert refs[0].release_stage == "accepted" + assert refs[0].release_year == 1234 + assert refs[0].ref_source == "crossref" + assert refs[0].key == "BIB0001|his12200-cit-0001" + assert refs[0].index == 1 + assert refs[0].locator is None + assert refs[0].biblio.contrib_raw_names is not None + assert refs[0].biblio.contrib_raw_names[0] == "Churg" + assert refs[0].biblio.container_name == "Arch. Pathol. Lab. Med." + assert refs[0].biblio.title == "The separation of benign and malignant mesothelial proliferations" + assert refs[0].biblio.year == 2012 + assert refs[0].biblio.pages == "1217" + assert refs[0].biblio.volume == "136" + assert refs[0].biblio.doi == "10.5858/arpa.2012-0112-RA" + assert refs[0].biblio.unstructured is None + + assert refs[6].biblio.title == "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference" + assert refs[6].biblio.year == 2001 + + assert refs[7].key == "CIT0041" + assert refs[7].biblio.unstructured == "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6." + + + assert refs[8].key == "576_CR3" + assert refs[8].biblio.unstructured is not None + assert refs[8].biblio.title == "The NURBS Book, Monographs in Visual Communication" + assert refs[8].biblio.year == 1997 + assert refs[8].biblio.version == "2" + -- cgit v1.2.3