diff options
-rw-r--r-- | fuzzycat/verify.py | 2 | ||||
-rw-r--r-- | tests/data/release/osgdmzg32jcr5ngh5p4laeeira | 462 | ||||
-rw-r--r-- | tests/data/release/r65jidi26jbpnf2eafbo3qi5am | 51 | ||||
-rw-r--r-- | tests/data/release/zh4tnlduu5bmngeipxaftaqwqq | 31 | ||||
-rw-r--r-- | tests/data/verify.csv | 3 |
5 files changed, 548 insertions, 1 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 9b29f81..21c1a15 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -465,7 +465,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: # preprint and published work may not be published in the same # year; compromise allow a small gap if a_release_year and b_release_year and abs(int(a_release_year) - - int(b_release_year)) > 2: + int(b_release_year)) > 4: return Verify(Status.DIFFERENT, Reason.YEAR) return Verify(Status.EXACT, Reason.TITLE_AUTHOR_MATCH) diff --git a/tests/data/release/osgdmzg32jcr5ngh5p4laeeira b/tests/data/release/osgdmzg32jcr5ngh5p4laeeira new file mode 100644 index 0000000..7803a80 --- /dev/null +++ b/tests/data/release/osgdmzg32jcr5ngh5p4laeeira @@ -0,0 +1,462 @@ +{ + "abstracts": [], + "container_id": "i6iajiiyxvgz3ob6jwcn2pufni", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_affiliation": "University of Alberta, Edmonton, Canada", + "raw_name": "Abram Hindle", + "role": "author" + }, + { + "index": 1, + "raw_affiliation": "University College London, United Kingdom", + "raw_name": "Earl T. Barr", + "role": "author" + }, + { + "index": 2, + "raw_affiliation": "UC Davis, CA", + "raw_name": "Mark Gabel", + "role": "author" + }, + { + "index": 3, + "raw_affiliation": "UC Davis, CA", + "raw_name": "Zhendong Su", + "role": "author" + }, + { + "index": 4, + "raw_affiliation": "UC Davis, CA", + "raw_name": "Premkumar Devanbu", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1145/2902362" + }, + "extra": { + "crossref": { + "license": [ + { + "URL": "http://www.acm.org/publications/policies/copyright_policy#Background", + "content-version": "vor", + "delay-in-days": 0, + "start": "2016-04-26T00:00:00Z" + } + ], + "type": "journal-article" + } + }, + "ident": "osgdmzg32jcr5ngh5p4laeeira", + "language": "en", + "pages": "122-131", + "publisher": "Association for Computing Machinery (ACM)", + "refs": [ + { + "extra": { + "doi": "10.1145/2635868.2635901", + "unstructured": "Allamanis, M., Sutton, C. Mining idioms from source code. InFSE. ACM, 2014." + }, + "index": 0, + "key": "key-10.1145/2902362-1" + }, + { + "extra": { + "unstructured": "Antoniol, G., Canfora, G., Casazza, G., Lucia, A.D., Merlo, E. Recovering traceability links between code and documentation.IEEE Trans. Softw. Eng. 28(2002), 970--983." + }, + "index": 1, + "key": "key-10.1145/2902362-2" + }, + { + "extra": { + "unstructured": "Arnold, S., Mark, L., Goldthwaite, J. Programming by voice, Vocal Programming. InProceedings, ACM Conference on Assistive Technologies.ACM, 2000, 149--155." + }, + "index": 2, + "key": "key-10.1145/2902362-3" + }, + { + "extra": { + "doi": "10.1109/vlhcc.2004.49", + "unstructured": "Begel, A. Spoken language support for software development. InProceedings, VL/HCC.IEEE Computer Society, 2004, 271--272." + }, + "index": 3, + "key": "key-10.1145/2902362-4" + }, + { + "extra": { + "unstructured": "Bellegarda, J. Statistical language model adaptation: Review and perspectives.Speech Commun. 42, 1 (2004), 93--108." + }, + "index": 4, + "key": "key-10.1145/2902362-5" + }, + { + "extra": { + "doi": "10.1145/1985441.1985471", + "unstructured": "Binkley, D., Hearn, M., Lawrie, D. Improving identifier informativeness using part of speech information. InProceedings, MSR.ACM, 2011." + }, + "index": 5, + "key": "key-10.1145/2902362-6" + }, + { + "extra": { + "unstructured": "Bruch, M., Bodden, E., Monperrus, M., Mezini, M. Ide 2.0: Collective intelligence in software development. InProceedings of the FSE/SDP workshop on Future of Software Engineering Research.ACM, 2010, 53--58." + }, + "index": 6, + "key": "key-10.1145/2902362-7" + }, + { + "extra": { + "doi": "10.1145/1595696.1595728", + "unstructured": "Bruch, M., Monperrus, M., Mezini, M. Learning from examples to improve code completion systems. InProceedings, ACM SIGSOFT ESEC/FSE, 2009." + }, + "index": 7, + "key": "key-10.1145/2902362-8" + }, + { + "extra": { + "doi": "10.1145/1858996.1859005", + "unstructured": "Buse, R., Weimer, W. Automatically documenting program changes. InProceedings, ASE.ACM, 2010, 33--42." + }, + "index": 8, + "key": "key-10.1145/2902362-9" + }, + { + "extra": { + "unstructured": "Campbell, J.C., Hindle, A., Amaral, J.N. Syntax errors just aren't natural: Improving error reporting with language models. InMSR.ACM, 2014." + }, + "index": 9, + "key": "key-10.1145/2902362-10" + }, + { + "extra": { + "unstructured": "Franks, C., Tu, Z., Devanbu, P., Hellendoorn, V. Cacheca: A cache language model based code suggestion tool.ICSE Demonst. Track(2015)." + }, + "index": 10, + "key": "key-10.1145/2902362-11" + }, + { + "extra": { + "unstructured": "Gabel, M., Su, Z. Javert: Fully automatic mining of general temporal properties from dynamic traces. InProceedings, ACM SIGSOFT FSE.ACM, 2008, 339--349." + }, + "index": 11, + "key": "key-10.1145/2902362-12" + }, + { + "extra": { + "doi": "10.1145/1882291.1882315", + "unstructured": "Gabel, M., Su, Z. A study of the uniqueness of source code. InProceedings, ACM SIGSOFT FSE.ACM, 2010, 147--156." + }, + "index": 12, + "key": "key-10.1145/2902362-13" + }, + { + "extra": { + "doi": "10.1109/ase.2009.64", + "unstructured": "Han, S., Wallace, D.R., Miller, R.C. Code completion from abbreviated input. InASE.IEEE Computer Society, 2009, 332--343." + }, + "index": 13, + "key": "key-10.1145/2902362-14" + }, + { + "extra": { + "unstructured": "Høst, E.W., Østvold, B.M.Software Language Engineering. Chapter The Java Programmer's Phrase Book.Springer-Verlag, Berlin, Heidelberg, 2009." + }, + "index": 14, + "key": "key-10.1145/2902362-15" + }, + { + "extra": { + "doi": "10.1007/978-3-642-03013-0_14", + "unstructured": "Høst, E., Østvold, B. Debugging method names. InProceedings ECOOP.Springer, 2009, 294--317." + }, + "index": 15, + "key": "key-10.1145/2902362-16" + }, + { + "extra": { + "doi": "10.1109/icsm.2011.6080790", + "unstructured": "Hou, D. Pletcher, D. An evaluation of the strategies of sorting, filtering, and grouping API methods for code completion. InProceedings, ICSM, 2011." + }, + "index": 16, + "key": "key-10.1145/2902362-17" + }, + { + "extra": { + "unstructured": "Hubbell, T., Langan, D, Hain, T. A voice-activated syntax-directed editor for manually disabled programme RS. InProceedings, ACM SIGACCESS.ACM, 2006." + }, + "index": 17, + "key": "key-10.1145/2902362-18" + }, + { + "extra": { + "doi": "10.1145/1900008.1900143", + "unstructured": "Jacob, F., Tairas, R. Code template inference using language models. InProceedings of the 48th Annual Southeast Regional Conference, 2010." + }, + "index": 18, + "key": "key-10.1145/2902362-19" + }, + { + "extra": { + "doi": "10.1145/2661136.2661148", + "unstructured": "Karaivanov, S., Raychev, V., Vechev, M. Phrase-based statistical translation of programming languages. InOnward!ACM, 2014, 173--184." + }, + "index": 19, + "key": "key-10.1145/2902362-20" + }, + { + "extra": { + "doi": "10.1145/1181775.1181781", + "unstructured": "Kim, S., Pan, K., Whitehead, E. Jr. Memories of bug fixes. InProceedings, ACM SIGSOFT FSE.ACM, 2006, 35--45." + }, + "index": 20, + "key": "key-10.1145/2902362-21" + }, + { + "extra": { + "doi": "10.1145/1357054.1357127", + "unstructured": "Kittur, A., Chi, E., Suh, B. Crowdsourcing user studies with mechanical turk. InProceedings, CHI.ACM, 2008." + }, + "index": 21, + "key": "key-10.1145/2902362-22" + }, + { + "extra": { + "unstructured": "Knuth, D.E. Literate programming.Comput. J. 21, 2 (1984), 97--111." + }, + "index": 22, + "key": "key-10.1145/2902362-23" + }, + { + "extra": { + "unstructured": "Koehn, P.Statistical Machine Translation.Cambridge University Press, 2010." + }, + "index": 23, + "key": "key-10.1145/2902362-24" + }, + { + "extra": { + "doi": "10.1109/icse.2005.1553580", + "unstructured": "Konrad, S., Cheng, B. Real-time specification patterns. InProceedings ICSE, 2005." + }, + "index": 24, + "key": "key-10.1145/2902362-25" + }, + { + "extra": { + "unstructured": "Körner, S.J., Tichy, W.F. Text to software. InProceedings of FSE/SDP Workshop on the Future of Software Engineering Research.ACM, Nov. 2010." + }, + "index": 25, + "key": "key-10.1145/2902362-26" + }, + { + "extra": { + "unstructured": "Lawrie, D., Morrell, C., field, H., Binkley, D. What's in a name? A study of identifiers.Proceedings, ICPC, 2006." + }, + "index": 26, + "key": "key-10.1145/2902362-27" + }, + { + "extra": { + "unstructured": "Linstead, E., Bajracharya, S., Ngo, T., Rigor, P., Lopes, C., Baldi, P. Sourcerer: Mining and searching internet-scale software repositories.Data Mining Knowl. Discov. 18, 2 (2009), 300--336." + }, + "index": 27, + "key": "key-10.1145/2902362-28" + }, + { + "extra": { + "unstructured": "Livshits, B., Zimmermann, T. Dynamine: Finding common error patterns by mining software revision histories.ACM SIGSOFT Softw. Eng. Notes 30, 5 (2005)." + }, + "index": 28, + "key": "key-10.1145/2902362-29" + }, + { + "extra": { + "unstructured": "Mandelin, D., Xu, L., Bodík, R., Kimelman, D. Jungloid mining: Helping to navigate the API jungle. InACM SIGPLAN Notices.Volume 40. ACM, 2005." + }, + "index": 29, + "key": "key-10.1145/2902362-30" + }, + { + "extra": { + "unstructured": "Manning, C., Schütze, H.Foundations of Statistical Natural Language Processing.Volume 59. MIT Press, 1999." + }, + "index": 30, + "key": "key-10.1145/2902362-31" + }, + { + "extra": { + "unstructured": "Marcus, M., Marcinkiewicz, M., Santorini, B. Building a large annotated corpus of English: The Penn treebank.Comput. Linguist. 19, 2 (1993), 313--330." + }, + "index": 31, + "key": "key-10.1145/2902362-32" + }, + { + "extra": { + "doi": "10.1109/wac.2006.375941", + "unstructured": "Mills, S., Saadat, S., Whiting, D. Is voice recognition the solution to keyboard-based RSI? InAutomation Congress, 2006. WAC'06. World, 2006." + }, + "index": 32, + "key": "key-10.1145/2902362-33" + }, + { + "extra": { + "doi": "10.1145/2642937.2643010", + "unstructured": "Nguyen, A.T., Nguyen, H.A., Nguyen, T.T., Nguyen, T.N. Statistical learning approach for mining API usage mappings for code migration. InASE, 2014." + }, + "index": 33, + "key": "key-10.1145/2902362-34" + }, + { + "extra": { + "unstructured": "Nguyen, A.T., Nguyen, T.N. Graph-based statistical language model for code. 2015." + }, + "index": 34, + "key": "key-10.1145/2902362-35" + }, + { + "extra": { + "doi": "10.1145/2491411.2494584", + "unstructured": "Nguyen, A.T., Nguyen, T.T., Nguyen, T.N. Lexical statistical machine translation for language migration. InFSE, 2013." + }, + "index": 35, + "key": "key-10.1145/2902362-36" + }, + { + "extra": { + "doi": "10.1145/2591062.2591072", + "unstructured": "Nguyen, A.T., Nguyen, T.T., Nguyen, T.N. Migrating code with statistical machine translation. InICSE Companion, 2014." + }, + "index": 36, + "key": "key-10.1145/2902362-37" + }, + { + "extra": { + "doi": "10.1145/2491411.2491458", + "unstructured": "Nguyen, T.T., Nguyen, A.T., Nguyen, H.A., Nguyen, T.N. A statistical semantic language model for source code. InESEC FSE.ACM, 2013." + }, + "index": 37, + "key": "key-10.1145/2902362-38" + }, + { + "extra": { + "unstructured": "Pinker, S.The Language Instinct: The New Science of Language and Mind.Volume 7529. Penguin, London, UK, 1994." + }, + "index": 38, + "key": "key-10.1145/2902362-39" + }, + { + "extra": { + "unstructured": "Rastkar, S., Murphy, G.C., Bradley, A. Generating natural language summaries for cross-cutting source code concerns. InProceedings, ICSM, 2011." + }, + "index": 39, + "key": "key-10.1145/2902362-40" + }, + { + "extra": { + "doi": "10.1145/2676726.2677009", + "unstructured": "Raychev, V., Vechev, M., Krause, A. Predicting program properties from big code. InPOPL.ACM, 2015." + }, + "index": 40, + "key": "key-10.1145/2902362-41" + }, + { + "extra": { + "unstructured": "Robbes, R., Lanza, M. Improving code completion with program history.Autom. Softw. Eng. 17, 2 (2010), 181--212." + }, + "index": 41, + "key": "key-10.1145/2902362-42" + }, + { + "extra": { + "doi": "10.1007/bfb0035136", + "unstructured": "Rolland, C., Proix, C. A natural language approach for requirements engineering. InAdvanced Information Systems Engineering.Springer, 1992, 257--277." + }, + "index": 42, + "key": "key-10.1145/2902362-43" + }, + { + "extra": { + "doi": "10.1145/1218563.1218587", + "unstructured": "Shepherd, D., Fry, Z., Hill, E., Pollock, L., Vijay-Shanker, K. Using natural language program analysis to locate and understand action-oriented concerns. InProceedings, AOSD.ACM, 2007, 212--224." + }, + "index": 43, + "key": "key-10.1145/2902362-44" + }, + { + "extra": { + "doi": "10.1145/1082983.1083129", + "unstructured": "Shepherd, D., Pollock, L., Tourwé, T. Using language clues to discover crosscutting concerns. InACM SIGSOFT Software Engineering Notes.Volume 30. ACM, 2005." + }, + "index": 44, + "key": "key-10.1145/2902362-45" + }, + { + "extra": { + "unstructured": "Sparck-Jones, K. Natural language processing: A historical review.Current Issues in Computational Linguistics: In Honour of Don Walker (Ed Zampolli, Calzolari and Palmer).Kluwer, Amsterdam, the Netherlands, 1994." + }, + "index": 45, + "key": "key-10.1145/2902362-46" + }, + { + "extra": { + "doi": "10.1145/1858996.1859006", + "unstructured": "Sridhara, G., Hill, E., Muppaneni, D., Pollock, L., Vijay-Shanker, K. Towards automatically generating summary comments for Java methods. InProceedings, ASE, 2010." + }, + "index": 46, + "key": "key-10.1145/2902362-47" + }, + { + "extra": { + "doi": "10.1145/1985793.1985808", + "unstructured": "Sridhara, G., Pollock, L., Vijay-Shanker, K. Automatically detecting and describing high level actions within methods. InProceedings, ICSE, 2011." + }, + "index": 47, + "key": "key-10.1145/2902362-48" + }, + { + "extra": { + "doi": "10.1145/2635868.2635875", + "unstructured": "Tu, Z., Su, Z., Devanbu, P. On the localness of software. InFSE.ACM, 2014, 269--280." + }, + "index": 48, + "key": "key-10.1145/2902362-49" + }, + { + "extra": { + "doi": "10.1109/msr.2015.38", + "unstructured": "White, M., Vendome, C., Linares-Vásquez, M., Poshyvanyk, D. Toward deep learning software repositories. InMSR, 2015." + }, + "index": 49, + "key": "key-10.1145/2902362-50" + }, + { + "extra": { + "unstructured": "Xie, T., Thummalapenta, S., Lo, D., Liu, C. Data mining for software engineering.IEEE Comput. 42, 8 (2009)." + }, + "index": 50, + "key": "key-10.1145/2902362-51" + }, + { + "extra": { + "doi": "10.1109/icse.2004.1317478", + "unstructured": "Zimmermann, T., Weisgerber, P., Diehl, S., Zeller, A. Mining version histories to guide software changes. InICSE.IEEE Computer Society, 2004." + }, + "index": 51, + "key": "key-10.1145/2902362-52" + } + ], + "release_date": "2016-04-26", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2016, + "revision": "704dbdea-eff5-4c8c-82bb-3c11dd4ca877", + "state": "active", + "title": "On the naturalness of software", + "volume": "59", + "work_id": "ga3y432u4vfabmbkkou4rnwf2i" +} diff --git a/tests/data/release/r65jidi26jbpnf2eafbo3qi5am b/tests/data/release/r65jidi26jbpnf2eafbo3qi5am new file mode 100644 index 0000000..55f09d5 --- /dev/null +++ b/tests/data/release/r65jidi26jbpnf2eafbo3qi5am @@ -0,0 +1,51 @@ +{ + "abstracts": [], + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Abram Hindle", + "role": "author" + }, + { + "index": 1, + "raw_name": "Earl T. Barr", + "role": "author" + }, + { + "index": 2, + "raw_name": "Zhendong Su", + "role": "author" + }, + { + "index": 3, + "raw_name": "Mark Gabel", + "role": "author" + }, + { + "index": 4, + "raw_name": "Premkumar Devanbu", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1109/icse.2012.6227135" + }, + "extra": { + "container_name": "2012 34th International Conference on Software Engineering (ICSE)", + "crossref": { + "type": "proceedings-article" + } + }, + "ident": "r65jidi26jbpnf2eafbo3qi5am", + "publisher": "IEEE", + "refs": [], + "release_type": "paper-conference", + "release_year": 2012, + "revision": "997b7888-65c7-4590-b85e-b80665c1359f", + "state": "active", + "title": "On the naturalness of software", + "work_id": "ewpldvhthbbdrilubzhignz2mu" +} diff --git a/tests/data/release/zh4tnlduu5bmngeipxaftaqwqq b/tests/data/release/zh4tnlduu5bmngeipxaftaqwqq new file mode 100644 index 0000000..3b51185 --- /dev/null +++ b/tests/data/release/zh4tnlduu5bmngeipxaftaqwqq @@ -0,0 +1,31 @@ +{ + "abstracts": [], + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Prem Devanbu", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1145/2442754.2442763" + }, + "extra": { + "container_name": "Proceedings of the 6th India Software Engineering Conference on - ISEC '13", + "crossref": { + "type": "proceedings-article" + } + }, + "ident": "zh4tnlduu5bmngeipxaftaqwqq", + "publisher": "ACM Press", + "refs": [], + "release_type": "paper-conference", + "release_year": 2013, + "revision": "c6a978b2-c79e-4241-8bbe-0cefd2ac8370", + "state": "active", + "title": "On the naturalness of software", + "work_id": "hi4bcgrlofbu3mflibl2ou4444" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index cd02a34..8e45307 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -190,3 +190,6 @@ s5a6e6wnlvdelge256xpha6oqu,zoeto2mymzhi3l74fr2ps5qjyy,, 4zsgxrjpzzg5hbffh4ad4a774a,myayyezdibc2flxxspaflymmu4,Status.EXACT,WORK_ID 4zsgxrjpzzg5hbffh4ad4a774a,v2dwuua2rbhyjee2mjvumrdjry,Status.EXACT,TITLE_AUTHOR_MATCH myayyezdibc2flxxspaflymmu4,v2dwuua2rbhyjee2mjvumrdjry,Status.EXACT,TITLE_AUTHOR_MATCH +osgdmzg32jcr5ngh5p4laeeira,r65jidi26jbpnf2eafbo3qi5am,Status.EXACT,TITLE_AUTHOR_MATCH +osgdmzg32jcr5ngh5p4laeeira,zh4tnlduu5bmngeipxaftaqwqq,Status.DIFFERENT,YEAR +r65jidi26jbpnf2eafbo3qi5am,zh4tnlduu5bmngeipxaftaqwqq,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY |