summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-12-02 11:30:03 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-12-17 23:03:08 -0800
commit c66f9b2d98de88a98d3a1737d415bdab4e89027c (patch)
tree 7d89d9b91c1fad8d436c8145c934c60ca074b7b5 /python
parent a57de2331caa6f92242dae6197b44926801b429a (diff)
download fatcat-c66f9b2d98de88a98d3a1737d415bdab4e89027c.tar.gz
fatcat-c66f9b2d98de88a98d3a1737d415bdab4e89027c.zip
basic test coverage of dblp release importer
Diffstat (limited to 'python')
-rw-r--r-- python/tests/files/dblp.dtd 351
-rw-r--r-- python/tests/files/example_dblp.xml 66
-rw-r--r-- python/tests/files/example_dblp_article.xml 14
-rw-r--r-- python/tests/import_dblp.py 72
4 files changed, 503 insertions, 0 deletions
diff --git a/python/tests/files/dblp.dtd b/python/tests/files/dblp.dtd
new file mode 100644
index 00000000..7fedacd2
--- /dev/null
+++ b/python/tests/files/dblp.dtd
@@ -0,0 +1,351 @@
+<!--
+ The dblp computer science bibliography is copyright by Schloss Dagstuhl - Leibniz Center for Informatics.
+
+ The metadata provided by dblp on its webpages, as well as their XML, JSON, RDF, RIS, BibTeX, and text export formats
+ available at our website, is released under the CC0 1.0 Public Domain Dedication license. That is, you are free to copy,
+ distribute, use, modify, transform, build upon, and produce derived works from our data, even for commercial purposes,
+ all without asking permission. Of course, we are always happy if you provide a link to us as the source of the data.
+
+ Read the full CC0 1.0 legal code for the exact terms that apply:
+
+ https://creativecommons.org/publicdomain/zero/1.0/legalcode
+
+ A daily updated dblp XML record dump is available at:
+
+ https://dblp.org/xml/dblp.xml.gz
+
+ Persistent snapshots of the dblp XML record dumps are available at:
+
+ https://dblp.org/xml/release/
+
+ Date of this DTD file: November 23, 2019
+
+ A changelog for this dblp.dtd can be found at:
+
+ https://dblp.org/xml/CHANGES.txt
+
+ For further details on the content of dblp XML records see:
+
+ https://dblp.org/faq/16154937.html
+-->
+
+<!ELEMENT dblp (article|inproceedings|proceedings|book|incollection|
+ phdthesis|mastersthesis|www|person|data)*>
+<!ATTLIST dblp mdate CDATA #IMPLIED >
+
+<!ENTITY % field "author|editor|title|booktitle|pages|year|address|journal|volume|number|month|url|ee|cdrom|cite|publisher|note|crossref|isbn|series|school|chapter|publnr">
+
+<!ELEMENT article (%field;)*>
+<!ATTLIST article
+ key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ reviewid CDATA #IMPLIED
+ rating CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT inproceedings (%field;)*>
+<!ATTLIST inproceedings key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT proceedings (%field;)*>
+<!ATTLIST proceedings key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT book (%field;)*>
+<!ATTLIST book key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT incollection (%field;)*>
+<!ATTLIST incollection key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT phdthesis (%field;)*>
+<!ATTLIST phdthesis key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT mastersthesis (%field;)*>
+<!ATTLIST mastersthesis key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT www (%field;)*>
+<!ATTLIST www key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT data (%field;)*>
+<!ATTLIST data key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ publtype CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+>
+
+<!ELEMENT person ((author*, (note|url|cite)*)|crossref) >
+<!ATTLIST person key CDATA #REQUIRED
+ mdate CDATA #IMPLIED
+ cdate CDATA #IMPLIED
+ >
+
+<!ELEMENT author (#PCDATA)>
+<!ATTLIST author
+ aux CDATA #IMPLIED
+ bibtex CDATA #IMPLIED
+ orcid CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT editor (#PCDATA)>
+<!ATTLIST editor
+ aux CDATA #IMPLIED
+ orcid CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT address (#PCDATA)>
+<!ATTLIST address
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ENTITY % titlecontents "#PCDATA|sub|sup|i|tt|ref">
+<!ELEMENT title (%titlecontents;)*>
+<!ATTLIST title
+ bibtex CDATA #IMPLIED
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT booktitle (#PCDATA)>
+<!ATTLIST booktitle
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT pages (#PCDATA)>
+<!ATTLIST pages
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT year (#PCDATA)>
+<!ATTLIST year
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT journal (#PCDATA)>
+<!ATTLIST journal
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT volume (#PCDATA)>
+<!ATTLIST volume
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT number (#PCDATA)>
+<!ATTLIST number
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT month (#PCDATA)>
+<!ATTLIST month
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT url (#PCDATA)>
+<!ATTLIST url
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT ee (#PCDATA)>
+<!ATTLIST ee
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT cite (#PCDATA)>
+<!ATTLIST cite
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+ ref CDATA #IMPLIED
+>
+
+<!ELEMENT school (#PCDATA)>
+<!ATTLIST school
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT publisher (#PCDATA)>
+<!ATTLIST publisher
+ href CDATA #IMPLIED
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT note (#PCDATA)>
+<!ATTLIST note
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT cdrom (#PCDATA)>
+
+<!ELEMENT crossref (#PCDATA)>
+<!ELEMENT isbn (#PCDATA)>
+<!ATTLIST isbn
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT chapter (#PCDATA)>
+<!ELEMENT series (#PCDATA)>
+<!ATTLIST series
+ href CDATA #IMPLIED
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!ELEMENT publnr (#PCDATA) >
+<!ATTLIST publnr
+ aux CDATA #IMPLIED
+ label CDATA #IMPLIED
+ type CDATA #IMPLIED
+>
+
+<!-- sub elements of the title element -->
+<!ELEMENT ref (#PCDATA)>
+<!ATTLIST ref href CDATA #REQUIRED>
+<!ELEMENT sup (%titlecontents;)*>
+<!ELEMENT sub (%titlecontents;)*>
+<!ELEMENT i (%titlecontents;)*>
+<!ELEMENT tt (%titlecontents;)*>
+
+<!ENTITY reg "&#174;">
+<!ENTITY micro "&#181;">
+<!ENTITY times "&#215;">
+
+<!-- (C) International Organization for Standardization 1986
+ Permission to copy in any form is granted for use with
+ conforming SGML systems and applications as defined in
+ ISO 8879, provided this notice is included in all copies.
+-->
+<!-- Character entity set. Typical invocation:
+ <!ENTITY % HTMLlat1 PUBLIC
+ "ISO 8879-1986//ENTITIES Added Latin 1//EN//XML">
+-->
+<!-- This version of the entity set can be used with any SGML document
+ which uses ISO 8859-1 or ISO 10646 as its document character
+ set. This includes XML documents and ISO HTML documents.
+-->
+
+ <!ENTITY Agrave "&#192;" ><!-- capital A, grave accent -->
+ <!ENTITY Aacute "&#193;" ><!-- capital A, acute accent -->
+ <!ENTITY Acirc "&#194;" ><!-- capital A, circumflex accent -->
+ <!ENTITY Atilde "&#195;" ><!-- capital A, tilde -->
+ <!ENTITY Auml "&#196;" ><!-- capital A, dieresis or umlaut mark -->
+ <!ENTITY Aring "&#197;" ><!-- capital A, ring -->
+ <!ENTITY AElig "&#198;" ><!-- capital AE diphthong (ligature) -->
+ <!ENTITY Ccedil "&#199;" ><!-- capital C, cedilla -->
+ <!ENTITY Egrave "&#200;" ><!-- capital E, grave accent -->
+ <!ENTITY Eacute "&#201;" ><!-- capital E, acute accent -->
+ <!ENTITY Ecirc "&#202;" ><!-- capital E, circumflex accent -->
+ <!ENTITY Euml "&#203;" ><!-- capital E, dieresis or umlaut mark -->
+ <!ENTITY Igrave "&#204;" ><!-- capital I, grave accent -->
+ <!ENTITY Iacute "&#205;" ><!-- capital I, acute accent -->
+ <!ENTITY Icirc "&#206;" ><!-- capital I, circumflex accent -->
+ <!ENTITY Iuml "&#207;" ><!-- capital I, dieresis or umlaut mark -->
+ <!ENTITY ETH "&#208;" ><!-- capital Eth, Icelandic -->
+ <!ENTITY Ntilde "&#209;" ><!-- capital N, tilde -->
+ <!ENTITY Ograve "&#210;" ><!-- capital O, grave accent -->
+ <!ENTITY Oacute "&#211;" ><!-- capital O, acute accent -->
+ <!ENTITY Ocirc "&#212;" ><!-- capital O, circumflex accent -->
+ <!ENTITY Otilde "&#213;" ><!-- capital O, tilde -->
+ <!ENTITY Ouml "&#214;" ><!-- capital O, dieresis or umlaut mark -->
+ <!ENTITY Oslash "&#216;" ><!-- capital O, slash -->
+ <!ENTITY Ugrave "&#217;" ><!-- capital U, grave accent -->
+ <!ENTITY Uacute "&#218;" ><!-- capital U, acute accent -->
+ <!ENTITY Ucirc "&#219;" ><!-- capital U, circumflex accent -->
+ <!ENTITY Uuml "&#220;" ><!-- capital U, dieresis or umlaut mark -->
+ <!ENTITY Yacute "&#221;" ><!-- capital Y, acute accent -->
+ <!ENTITY THORN "&#222;" ><!-- capital THORN, Icelandic -->
+ <!ENTITY szlig "&#223;" ><!-- small sharp s, German (sz ligature) -->
+ <!ENTITY agrave "&#224;" ><!-- small a, grave accent -->
+ <!ENTITY aacute "&#225;" ><!-- small a, acute accent -->
+ <!ENTITY acirc "&#226;" ><!-- small a, circumflex accent -->
+ <!ENTITY atilde "&#227;" ><!-- small a, tilde -->
+ <!ENTITY auml "&#228;" ><!-- small a, dieresis or umlaut mark -->
+ <!ENTITY aring "&#229;" ><!-- small a, ring -->
+ <!ENTITY aelig "&#230;" ><!-- small ae diphthong (ligature) -->
+ <!ENTITY ccedil "&#231;" ><!-- small c, cedilla -->
+ <!ENTITY egrave "&#232;" ><!-- small e, grave accent -->
+ <!ENTITY eacute "&#233;" ><!-- small e, acute accent -->
+ <!ENTITY ecirc "&#234;" ><!-- small e, circumflex accent -->
+ <!ENTITY euml "&#235;" ><!-- small e, dieresis or umlaut mark -->
+ <!ENTITY igrave "&#236;" ><!-- small i, grave accent -->
+ <!ENTITY iacute "&#237;" ><!-- small i, acute accent -->
+ <!ENTITY icirc "&#238;" ><!-- small i, circumflex accent -->
+ <!ENTITY iuml "&#239;" ><!-- small i, dieresis or umlaut mark -->
+ <!ENTITY eth "&#240;" ><!-- small eth, Icelandic -->
+ <!ENTITY ntilde "&#241;" ><!-- small n, tilde -->
+ <!ENTITY ograve "&#242;" ><!-- small o, grave accent -->
+ <!ENTITY oacute "&#243;" ><!-- small o, acute accent -->
+ <!ENTITY ocirc "&#244;" ><!-- small o, circumflex accent -->
+ <!ENTITY otilde "&#245;" ><!-- small o, tilde -->
+ <!ENTITY ouml "&#246;" ><!-- small o, dieresis or umlaut mark -->
+
+ <!ENTITY oslash "&#248;" ><!-- small o, slash -->
+ <!ENTITY ugrave "&#249;" ><!-- small u, grave accent -->
+ <!ENTITY uacute "&#250;" ><!-- small u, acute accent -->
+ <!ENTITY ucirc "&#251;" ><!-- small u, circumflex accent -->
+ <!ENTITY uuml "&#252;" ><!-- small u, dieresis or umlaut mark -->
+ <!ENTITY yacute "&#253;" ><!-- small y, acute accent -->
+ <!ENTITY thorn "&#254;" ><!-- small thorn, Icelandic -->
+ <!ENTITY yuml "&#255;" ><!-- small y, dieresis or umlaut mark -->
+
diff --git a/python/tests/files/example_dblp.xml b/python/tests/files/example_dblp.xml
new file mode 100644
index 00000000..ac846d4f
--- /dev/null
+++ b/python/tests/files/example_dblp.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!DOCTYPE dblp SYSTEM "dblp.dtd">
+<dblp>
+
+<!-- note these entries have been manipulated for testing -->
+
+<inproceedings key="conf/er/Norrie08" mdate="2008-10-20">
+<author>Moira C. Norrie</author>
+<author>Michael H. B&ouml;hlen</author>
+<title>PIM Meets Web 2.0.</title>
+<pages>15-25</pages>
+<year>2008</year>
+<booktitle>ER</booktitle>
+<ee>http://dx.doi.org/10.1007/978-3-540-87877-3 3</ee>
+<crossref>conf/er/2008</crossref>
+<url>db/conf/er/er2008.html#Norrie08</url>
+</inproceedings>
+
+<proceedings key="conf/er/2008" mdate="2008-10-20">
+<editor>Qing Li</editor>
+<editor>Stefano Spaccapietra</editor>
+<editor>Eric Yu</editor>
+<editor>Antoni Oliv&eacute;</editor>
+<title>Conceptual Modeling - ER 2008, 27th International Conference on Conceptual Modeling, Barcelona, Spain, October 20-24, 2008. Proceedings</title>
+<volume>5231</volume>
+<year>2008</year>
+<isbn>978-3-540-87876-6</isbn>
+<booktitle>ER</booktitle>
+<series href="db/journals/lncs.html">Lecture Notes in Computer Science</series>
+<publisher>Springer</publisher>
+<url>db/conf/er/er2008.html</url>
+</proceedings>
+
+
+<article key="journals/cacm/Gentry10" mdate="2010-04-26">
+<author>Craig Gentry</author>
+<title>Computing arbitrary functions of encrypted data.</title>
+<pages>97-105</pages>
+<year>2010</year>
+<volume>53</volume>
+<journal>Commun. ACM</journal>
+<number>3</number>
+<ee>http://doi.acm.org/10.1145/1666420.1666444</ee>
+<url>db/journals/cacm/cacm53.html#Gentry10</url>
+</article>
+
+
+<inproceedings key="conf/focs/Yao82a" mdate="2011-10-19">
+<title>Theory and Applications of Trapdoor Functions (Extended Abstract)</title>
+<author>Andrew Chi-Chih Yao</author>
+<pages>80-91</pages>
+<crossref>conf/focs/FOCS23</crossref>
+<year>1982</year>
+<booktitle>FOCS</booktitle>
+<url>db/conf/focs/focs82.html#Yao82a</url>
+<ee>http://doi.ieeecomputersociety.org/10.1109/SFCS.1982.45</ee>
+</inproceedings>
+
+
+<www mdate="2004-03-23" key="homepages/g/OdedGoldreich">
+<author>Oded Goldreich</author>
+<title>Home Page</title>
+<url>http://www.wisdom.weizmann.ac.il/~oded/</url>
+</www>
+
+</dblp>
diff --git a/python/tests/files/example_dblp_article.xml b/python/tests/files/example_dblp_article.xml
new file mode 100644
index 00000000..d6b192b1
--- /dev/null
+++ b/python/tests/files/example_dblp_article.xml
@@ -0,0 +1,14 @@
+<article key="journals/cacm/Szalay08" mdate="2008-11-03">
+<author>Alexander S. Szalay</author>
+<author>Michael H. B&ouml;hlen</author>
+<author orcid="0000-0002-4354-9138">Nicolas Heist</author>
+<author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>
+<title>Jim Gray, astronomer.</title>
+<pages>58-65</pages>
+<year>2008</year>
+<volume>51</volume>
+<journal>Commun. ACM</journal>
+<number>11</number>
+<ee>http://doi.acm.org/10.1145/1400214.1400231</ee>
+<url>db/journals/cacm/cacm51.html#Szalay08</url>
+</article>
diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py
new file mode 100644
index 00000000..fd40eb06
--- /dev/null
+++ b/python/tests/import_dblp.py
@@ -0,0 +1,72 @@
+
+import pytest
+from bs4 import BeautifulSoup
+
+from fatcat_tools.importers import DblpReleaseImporter, Bs4XmlLargeFilePusher
+from fixtures import *
+
+
+@pytest.fixture(scope="function")
+def dblp_importer(api):  # fixture yielding a DblpReleaseImporter with bezerk_mode=True; `api` presumably comes from `fixtures` -- confirm
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:  # the yield is the body of this `with`, so the file stays open while the test runs
+ yield DblpReleaseImporter(api, issn_file, bezerk_mode=True, lookup_refs=True)
+
+@pytest.fixture(scope="function")
+def dblp_importer_existing(api):  # same as dblp_importer but with bezerk_mode=False; neither test in this hunk requests it
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:  # the yield is the body of this `with`, so the file stays open while the test runs
+ yield DblpReleaseImporter(api, issn_file, bezerk_mode=False, lookup_refs=True)
+
+def test_dblp_importer(dblp_importer):  # runs example_dblp.xml through the importer twice: first expecting inserts, then expecting 'exists'
+ last_index = dblp_importer.api.get_changelog(limit=1)[0].index  # index of the newest changelog entry before the first pass
+ with open('tests/files/example_dblp.xml', 'rb') as f:
+ dblp_importer.bezerk_mode = True  # the fixture already passes bezerk_mode=True; set again here to mirror the second pass
+ counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
+ print(counts)
+ assert counts['insert'] == 3
+ assert counts['exists'] == 0
+ assert counts['skip'] == 1
+
+ # fetch most recent editgroup
+ change = dblp_importer.api.get_changelog_entry(index=last_index+1)  # relies on the first pass adding its changelog entry at last_index+1
+ eg = change.editgroup
+ assert eg.description
+ assert "dblp" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.DblpReleaseImporter" in eg.extra['agent']
+
+ # check that entity name mangling was fixed on import
+ eg = dblp_importer.api.get_editgroup(eg.editgroup_id)  # re-fetched by id before reading eg.edits
+ release = dblp_importer.api.get_release(eg.edits.releases[0].ident)
+ assert release.contribs[1].raw_name == "Michael H. Böhlen"  # example_dblp.xml spells this author with the &ouml; entity
+
+ last_index = dblp_importer.api.get_changelog(limit=1)[0].index  # re-read so the second pass can be checked for new entries
+ with open('tests/files/example_dblp.xml', 'rb') as f:
+ dblp_importer.bezerk_mode = False  # second pass runs with bezerk_mode off
+ dblp_importer.reset()  # presumably clears state/counts left by the first pass -- confirm in the importer
+ counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run()
+ print(counts)
+ assert counts['insert'] == 0
+ assert counts['exists'] == 3
+ assert counts['skip'] == 1
+ assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index  # newest changelog index unchanged by the second pass
+
+def test_dblp_xml_parse(dblp_importer):  # parses the single <article> in example_dblp_article.xml with parse_record and checks its fields
+ with open('tests/files/example_dblp_article.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")  # bs4 "xml" parser; the import test uses the pusher with use_lxml=True instead
+ r1 = dblp_importer.parse_record(soup.find_all("article")[0])
+
+ assert r1.title == "Jim Gray, astronomer"  # fixture title is "Jim Gray, astronomer." -- expected without the trailing period
+ assert r1.contribs[0].raw_name == "Alexander S. Szalay"
+ # tested above, in LXML import path
+ #assert r1.contribs[1].raw_name == "Michael H. Bohlen"
+ assert r1.contribs[2].raw_name == "Nicolas Heist"
+ # XXX: assert r1.contribs[2].extra['orcid'] == "0000-0002-4354-9138"
+ assert r1.contribs[3].raw_name == "Jens Lehmann"  # fixture text is "Jens Lehmann 0001" -- expected without the numeric suffix
+ assert r1.ext_ids.dblp == "journals/cacm/Szalay08"  # matches the key attribute on the fixture's <article>
+ assert r1.ext_ids.doi == "10.1145/1400214.1400231"  # fixture <ee> is http://doi.acm.org/10.1145/1400214.1400231
+ assert r1.pages == "58-65"
+ assert r1.issue == "11"  # fixture <number> is 11
+ assert r1.volume == "51"
+ assert r1.release_year == 2008  # compared as an int, unlike the string-valued fields above
+ assert r1.extra['container_name'] == "Commun. ACM"  # fixture <journal> text
+ assert r1.extra['dblp']['type'] == "article"