diff options
Diffstat (limited to 'python/tests')
-rw-r--r-- | python/tests/files/dblp.dtd | 351 | ||||
-rw-r--r-- | python/tests/files/example_dblp.xml | 66 | ||||
-rw-r--r-- | python/tests/files/example_dblp_article.xml | 14 | ||||
-rw-r--r-- | python/tests/import_dblp.py | 72 |
4 files changed, 503 insertions, 0 deletions
diff --git a/python/tests/files/dblp.dtd b/python/tests/files/dblp.dtd new file mode 100644 index 00000000..7fedacd2 --- /dev/null +++ b/python/tests/files/dblp.dtd @@ -0,0 +1,351 @@ +<!-- + The dblp computer science bibliography is copyright by Schloss Dagstuhl - Leibniz Center for Informatics. + + The metadata provided by dblp on its webpages, as well as their XML, JSON, RDF, RIS, BibTeX, and text export formats + available at our website, is released under the CC0 1.0 Public Domain Dedication license. That is, you are free to copy, + distribute, use, modify, transform, build upon, and produce derived works from our data, even for commercial purposes, + all without asking permission. Of course, we are always happy if you provide a link to us as the source of the data. + + Read the full CC0 1.0 legal code for the exact terms that apply: + + https://creativecommons.org/publicdomain/zero/1.0/legalcode + + A daily updated dblp XML record dump is available at: + + https://dblp.org/xml/dblp.xml.gz + + Persistent snapshots of the dblp XML record dumps are available at: + + https://dblp.org/xml/release/ + + Date of this DTD file: November 23, 2019 + + A changelog for this dblp.dtd can be found at: + + https://dblp.org/xml/CHANGES.txt + + For further details on the content of dblp XML records see: + + https://dblp.org/faq/16154937.html +--> + +<!ELEMENT dblp (article|inproceedings|proceedings|book|incollection| + phdthesis|mastersthesis|www|person|data)*> +<!ATTLIST dblp mdate CDATA #IMPLIED > + +<!ENTITY % field "author|editor|title|booktitle|pages|year|address|journal|volume|number|month|url|ee|cdrom|cite|publisher|note|crossref|isbn|series|school|chapter|publnr"> + +<!ELEMENT article (%field;)*> +<!ATTLIST article + key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + reviewid CDATA #IMPLIED + rating CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT inproceedings (%field;)*> +<!ATTLIST inproceedings key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT proceedings (%field;)*> +<!ATTLIST proceedings key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT book (%field;)*> +<!ATTLIST book key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT incollection (%field;)*> +<!ATTLIST incollection key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT phdthesis (%field;)*> +<!ATTLIST phdthesis key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT mastersthesis (%field;)*> +<!ATTLIST mastersthesis key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT www (%field;)*> +<!ATTLIST www key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT data (%field;)*> +<!ATTLIST data key CDATA #REQUIRED + mdate CDATA #IMPLIED + publtype CDATA #IMPLIED + cdate CDATA #IMPLIED +> + +<!ELEMENT person ((author*, (note|url|cite)*)|crossref) > +<!ATTLIST person key CDATA #REQUIRED + mdate CDATA #IMPLIED + cdate CDATA #IMPLIED + > + +<!ELEMENT author (#PCDATA)> +<!ATTLIST author + aux CDATA #IMPLIED + bibtex CDATA #IMPLIED + orcid CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT editor (#PCDATA)> +<!ATTLIST editor + aux CDATA #IMPLIED + orcid CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT address (#PCDATA)> +<!ATTLIST address + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ENTITY % titlecontents "#PCDATA|sub|sup|i|tt|ref"> +<!ELEMENT title (%titlecontents;)*> +<!ATTLIST title + bibtex CDATA #IMPLIED + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT booktitle (#PCDATA)> +<!ATTLIST booktitle + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT pages (#PCDATA)> +<!ATTLIST pages + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT year (#PCDATA)> +<!ATTLIST year + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT journal (#PCDATA)> +<!ATTLIST journal + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT volume (#PCDATA)> +<!ATTLIST volume + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT number (#PCDATA)> +<!ATTLIST number + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT month (#PCDATA)> +<!ATTLIST month + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT url (#PCDATA)> +<!ATTLIST url + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT ee (#PCDATA)> +<!ATTLIST ee + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT cite (#PCDATA)> +<!ATTLIST cite + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED + ref CDATA #IMPLIED +> + +<!ELEMENT school (#PCDATA)> +<!ATTLIST school + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT publisher (#PCDATA)> +<!ATTLIST publisher + href CDATA #IMPLIED + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT note (#PCDATA)> +<!ATTLIST note + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT cdrom (#PCDATA)> + +<!ELEMENT crossref (#PCDATA)> +<!ELEMENT isbn (#PCDATA)> +<!ATTLIST isbn + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT chapter (#PCDATA)> +<!ELEMENT series (#PCDATA)> +<!ATTLIST series + href CDATA #IMPLIED + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!ELEMENT publnr (#PCDATA) > +<!ATTLIST publnr + aux CDATA #IMPLIED + label CDATA #IMPLIED + type CDATA #IMPLIED +> + +<!-- sub elements of the title element --> +<!ELEMENT ref (#PCDATA)> +<!ATTLIST ref href CDATA #REQUIRED> +<!ELEMENT sup (%titlecontents;)*> +<!ELEMENT sub (%titlecontents;)*> +<!ELEMENT i (%titlecontents;)*> +<!ELEMENT tt (%titlecontents;)*> + +<!ENTITY reg "®"> +<!ENTITY micro "µ"> +<!ENTITY times "×"> + +<!-- (C) International Organization for Standardization 1986 + Permission to copy in any form is granted for use with + conforming SGML systems and applications as defined in + ISO 8879, provided this notice is included in all copies. +--> +<!-- Character entity set. Typical invocation: + <!ENTITY % HTMLlat1 PUBLIC + "ISO 8879-1986//ENTITIES Added Latin 1//EN//XML"> +--> +<!-- This version of the entity set can be used with any SGML document + which uses ISO 8859-1 or ISO 10646 as its document character + set. This includes XML documents and ISO HTML documents. +--> + + <!ENTITY Agrave "À" ><!-- capital A, grave accent --> + <!ENTITY Aacute "Á" ><!-- capital A, acute accent --> + <!ENTITY Acirc "Â" ><!-- capital A, circumflex accent --> + <!ENTITY Atilde "Ã" ><!-- capital A, tilde --> + <!ENTITY Auml "Ä" ><!-- capital A, dieresis or umlaut mark --> + <!ENTITY Aring "Å" ><!-- capital A, ring --> + <!ENTITY AElig "Æ" ><!-- capital AE diphthong (ligature) --> + <!ENTITY Ccedil "Ç" ><!-- capital C, cedilla --> + <!ENTITY Egrave "È" ><!-- capital E, grave accent --> + <!ENTITY Eacute "É" ><!-- capital E, acute accent --> + <!ENTITY Ecirc "Ê" ><!-- capital E, circumflex accent --> + <!ENTITY Euml "Ë" ><!-- capital E, dieresis or umlaut mark --> + <!ENTITY Igrave "Ì" ><!-- capital I, grave accent --> + <!ENTITY Iacute "Í" ><!-- capital I, acute accent --> + <!ENTITY Icirc "Î" ><!-- capital I, circumflex accent --> + <!ENTITY Iuml "Ï" ><!-- capital I, dieresis or umlaut mark --> + <!ENTITY ETH "Ð" ><!-- capital Eth, Icelandic --> + <!ENTITY Ntilde "Ñ" ><!-- capital N, tilde --> + <!ENTITY Ograve "Ò" ><!-- capital O, grave accent --> + <!ENTITY Oacute "Ó" ><!-- capital O, acute accent --> + <!ENTITY Ocirc "Ô" ><!-- capital O, circumflex accent --> + <!ENTITY Otilde "Õ" ><!-- capital O, tilde --> + <!ENTITY Ouml "Ö" ><!-- capital O, dieresis or umlaut mark --> + <!ENTITY Oslash "Ø" ><!-- capital O, slash --> + <!ENTITY Ugrave "Ù" ><!-- capital U, grave accent --> + <!ENTITY Uacute "Ú" ><!-- capital U, acute accent --> + <!ENTITY Ucirc "Û" ><!-- capital U, circumflex accent --> + <!ENTITY Uuml "Ü" ><!-- capital U, dieresis or umlaut mark --> + <!ENTITY Yacute "Ý" ><!-- capital Y, acute accent --> + <!ENTITY THORN "Þ" ><!-- capital THORN, Icelandic --> + <!ENTITY szlig "ß" ><!-- small sharp s, German (sz ligature) --> + <!ENTITY agrave "à" ><!-- small a, grave accent --> + <!ENTITY aacute "á" ><!-- small a, acute accent --> + <!ENTITY acirc "â" ><!-- small a, circumflex accent --> + <!ENTITY atilde "ã" ><!-- small a, tilde --> + <!ENTITY auml "ä" ><!-- small a, dieresis or umlaut mark --> + <!ENTITY aring "å" ><!-- small a, ring --> + <!ENTITY aelig "æ" ><!-- small ae diphthong (ligature) --> + <!ENTITY ccedil "ç" ><!-- small c, cedilla --> + <!ENTITY egrave "è" ><!-- small e, grave accent --> + <!ENTITY eacute "é" ><!-- small e, acute accent --> + <!ENTITY ecirc "ê" ><!-- small e, circumflex accent --> + <!ENTITY euml "ë" ><!-- small e, dieresis or umlaut mark --> + <!ENTITY igrave "ì" ><!-- small i, grave accent --> + <!ENTITY iacute "í" ><!-- small i, acute accent --> + <!ENTITY icirc "î" ><!-- small i, circumflex accent --> + <!ENTITY iuml "ï" ><!-- small i, dieresis or umlaut mark --> + <!ENTITY eth "ð" ><!-- small eth, Icelandic --> + <!ENTITY ntilde "ñ" ><!-- small n, tilde --> + <!ENTITY ograve "ò" ><!-- small o, grave accent --> + <!ENTITY oacute "ó" ><!-- small o, acute accent --> + <!ENTITY ocirc "ô" ><!-- small o, circumflex accent --> + <!ENTITY otilde "õ" ><!-- small o, tilde --> + <!ENTITY ouml "ö" ><!-- small o, dieresis or umlaut mark --> + + <!ENTITY oslash "ø" ><!-- small o, slash --> + <!ENTITY ugrave "ù" ><!-- small u, grave accent --> + <!ENTITY uacute "ú" ><!-- small u, acute accent --> + <!ENTITY ucirc "û" ><!-- small u, circumflex accent --> + <!ENTITY uuml "ü" ><!-- small u, dieresis or umlaut mark --> + <!ENTITY yacute "ý" ><!-- small y, acute accent --> + <!ENTITY thorn "þ" ><!-- small thorn, Icelandic --> + <!ENTITY yuml "ÿ" ><!-- small y, dieresis or umlaut mark --> + diff --git a/python/tests/files/example_dblp.xml b/python/tests/files/example_dblp.xml new file mode 100644 index 00000000..ac846d4f --- /dev/null +++ b/python/tests/files/example_dblp.xml @@ -0,0 +1,66 @@ +<?xml version="1.0" encoding="ISO-8859-1"?> +<!DOCTYPE dblp SYSTEM "dblp.dtd"> +<dblp> + +<!-- note these entries have been manipulated for testing --> + +<inproceedings key="conf/er/Norrie08" mdate="2008-10-20"> +<author>Moira C. Norrie</author> +<author>Michael H. Böhlen</author> +<title>PIM Meets Web 2.0.</title> +<pages>15-25</pages> +<year>2008</year> +<booktitle>ER</booktitle> +<ee>http://dx.doi.org/10.1007/978-3-540-87877-3 3</ee> +<crossref>conf/er/2008</crossref> +<url>db/conf/er/er2008.html#Norrie08</url> +</inproceedings> + +<proceedings key="conf/er/2008" mdate="2008-10-20"> +<editor>Qing Li</editor> +<editor>Stefano Spaccapietra</editor> +<editor>Eric Yu</editor> +<editor>Antoni Olivé</editor> +<title>Conceptual Modeling - ER 2008, 27th International Conference on Conceptual Modeling, Barcelona, Spain, October 20-24, 2008. Proceedings</title> +<volume>5231</volume> +<year>2008</year> +<isbn>978-3-540-87876-6</isbn> +<booktitle>ER</booktitle> +<series href="db/journals/lncs.html">Lecture Notes in Computer Science</series> +<publisher>Springer</publisher> +<url>db/conf/er/er2008.html</url> +</proceedings> + + +<article key="journals/cacm/Gentry10" mdate="2010-04-26"> +<author>Craig Gentry</author> +<title>Computing arbitrary functions of encrypted data.</title> +<pages>97-105</pages> +<year>2010</year> +<volume>53</volume> +<journal>Commun. ACM</journal> +<number>3</number> +<ee>http://doi.acm.org/10.1145/1666420.1666444</ee> +<url>db/journals/cacm/cacm53.html#Gentry10</url> +</article> + + +<inproceedings key="conf/focs/Yao82a" mdate="2011-10-19"> +<title>Theory and Applications of Trapdoor Functions (Extended Abstract)</title> +<author>Andrew Chi-Chih Yao</author> +<pages>80-91</pages> +<crossref>conf/focs/FOCS23</crossref> +<year>1982</year> +<booktitle>FOCS</booktitle> +<url>db/conf/focs/focs82.html#Yao82a</url> +<ee>http://doi.ieeecomputersociety.org/10.1109/SFCS.1982.45</ee> +</inproceedings> + + +<www mdate="2004-03-23" key="homepages/g/OdedGoldreich"> +<author>Oded Goldreich</author> +<title>Home Page</title> +<url>http://www.wisdom.weizmann.ac.il/~oded/</url> +</www> + +</dblp> diff --git a/python/tests/files/example_dblp_article.xml b/python/tests/files/example_dblp_article.xml new file mode 100644 index 00000000..d6b192b1 --- /dev/null +++ b/python/tests/files/example_dblp_article.xml @@ -0,0 +1,14 @@ +<article key="journals/cacm/Szalay08" mdate="2008-11-03"> +<author>Alexander S. Szalay</author> +<author>Michael H. Böhlen</author> +<author orcid="0000-0002-4354-9138">Nicolas Heist</author> +<author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author> +<title>Jim Gray, astronomer.</title> +<pages>58-65</pages> +<year>2008</year> +<volume>51</volume> +<journal>Commun. ACM</journal> +<number>11</number> +<ee>http://doi.acm.org/10.1145/1400214.1400231</ee> +<url>db/journals/cacm/cacm51.html#Szalay08</url> +</article> diff --git a/python/tests/import_dblp.py b/python/tests/import_dblp.py new file mode 100644 index 00000000..fd40eb06 --- /dev/null +++ b/python/tests/import_dblp.py @@ -0,0 +1,72 @@ + +import pytest +from bs4 import BeautifulSoup + +from fatcat_tools.importers import DblpReleaseImporter, Bs4XmlLargeFilePusher +from fixtures import * + + +@pytest.fixture(scope="function") +def dblp_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DblpReleaseImporter(api, issn_file, bezerk_mode=True, lookup_refs=True) + +@pytest.fixture(scope="function") +def dblp_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DblpReleaseImporter(api, issn_file, bezerk_mode=False, lookup_refs=True) + +def test_dblp_importer(dblp_importer): + last_index = dblp_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_dblp.xml', 'rb') as f: + dblp_importer.bezerk_mode = True + counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run() + print(counts) + assert counts['insert'] == 3 + assert counts['exists'] == 0 + assert counts['skip'] == 1 + + # fetch most recent editgroup + change = dblp_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "dblp" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.DblpReleaseImporter" in eg.extra['agent'] + + # check that entity name mangling was fixed on import + eg = dblp_importer.api.get_editgroup(eg.editgroup_id) + release = dblp_importer.api.get_release(eg.edits.releases[0].ident) + assert release.contribs[1].raw_name == "Michael H. Böhlen" + + last_index = dblp_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_dblp.xml', 'rb') as f: + dblp_importer.bezerk_mode = False + dblp_importer.reset() + counts = Bs4XmlLargeFilePusher(dblp_importer, f, dblp_importer.ELEMENT_TYPES, use_lxml=True).run() + print(counts) + assert counts['insert'] == 0 + assert counts['exists'] == 3 + assert counts['skip'] == 1 + assert last_index == dblp_importer.api.get_changelog(limit=1)[0].index + +def test_dblp_xml_parse(dblp_importer): + with open('tests/files/example_dblp_article.xml', 'r') as f: + soup = BeautifulSoup(f, "xml") + r1 = dblp_importer.parse_record(soup.find_all("article")[0]) + + assert r1.title == "Jim Gray, astronomer" + assert r1.contribs[0].raw_name == "Alexander S. Szalay" + # tested above, in LXML import path + #assert r1.contribs[1].raw_name == "Michael H. Bohlen" + assert r1.contribs[2].raw_name == "Nicolas Heist" + # XXX: assert r1.contribs[2].extra['orcid'] == "0000-0002-4354-9138" + assert r1.contribs[3].raw_name == "Jens Lehmann" + assert r1.ext_ids.dblp == "journals/cacm/Szalay08" + assert r1.ext_ids.doi == "10.1145/1400214.1400231" + assert r1.pages == "58-65" + assert r1.issue == "11" + assert r1.volume == "51" + assert r1.release_year == 2008 + assert r1.extra['container_name'] == "Commun. ACM" + assert r1.extra['dblp']['type'] == "article" |