diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 19:35:23 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 19:36:23 -0700 | 
| commit | 98e67e291132b10a0ca698ad4ff754acc0c22121 (patch) | |
| tree | 940d436787c14548eacf6b86d3670a9f3ef3b1ad /scalding | |
| parent | 715a35715609d8cbacff53dd5c7c1715c53a55f8 (diff) | |
| download | sandcrawler-98e67e291132b10a0ca698ad4ff754acc0c22121.tar.gz sandcrawler-98e67e291132b10a0ca698ad4ff754acc0c22121.zip | |
add a content-type filter for crossref works
Diffstat (limited to 'scalding')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 18 | ||||
| -rw-r--r-- | scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala | 9 | 
2 files changed, 26 insertions, 1 deletion
| diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index baa1ca9..039fa85 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -26,6 +26,21 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {  }  object CrossrefScorable { + +  val ContentTypeWhitelist: Set[String] = Set( +    "book", +    "book-chapter", +    "dataset", +    "dissertation", +    "journal-article", +    "letter", +    "monograph", +    "posted-content", +    "pre-print", +    "proceedings-article", +    "report", +    "working-paper") +    def keepRecord(json : String) : Boolean = {      Scorable.jsonToMap(json) match {        case None => false @@ -90,7 +105,8 @@ object CrossrefScorable {              val doi = Scorable.getString(map, "DOI")              val authors: List[String] = mapToAuthorList(map)              val year: Int = mapToYear(map).getOrElse(0) -            if (doi.isEmpty || doi == null || authors.length == 0) { +            val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE") +            if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {                MapFeatures(Scorable.NoSlug, json)              } else {                val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year) diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index 0cb12ee..f598cae 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -74,6 +74,8 @@ class CrossrefScorableTest extends FlatSpec with Matchers {    val CrossrefStringWithoutTitle = CrossrefString.replace("title", 
"nottitle")    val MalformedCrossrefString = CrossrefString.replace("}", "")    val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author") +  val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other") +  val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")    // Unit tests    "CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in { @@ -139,4 +141,11 @@ class CrossrefScorableTest extends FlatSpec with Matchers {    it should "return false for invalid JSON" in {      CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false    } + +  it should "handle content types" in { +    val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType) +    resultWrong.slug shouldBe Scorable.NoSlug +    val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType) +    resultMissing.slug shouldBe Scorable.NoSlug +  }  } | 
