about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-23 19:35:23 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-23 19:36:23 -0700
commit 98e67e291132b10a0ca698ad4ff754acc0c22121 (patch)
tree 940d436787c14548eacf6b86d3670a9f3ef3b1ad
parent 715a35715609d8cbacff53dd5c7c1715c53a55f8 (diff)
download sandcrawler-98e67e291132b10a0ca698ad4ff754acc0c22121.tar.gz
sandcrawler-98e67e291132b10a0ca698ad4ff754acc0c22121.zip
add a content-type filter for crossref works
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 18
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 9
2 files changed, 26 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index baa1ca9..039fa85 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -26,6 +26,21 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
}
object CrossrefScorable {
+
+ val ContentTypeWhitelist: Set[String] = Set(
+ "book",
+ "book-chapter",
+ "dataset",
+ "dissertation",
+ "journal-article",
+ "letter",
+ "monograph",
+ "posted-content",
+ "pre-print",
+ "proceedings-article",
+ "report",
+ "working-paper")
+
def keepRecord(json : String) : Boolean = {
Scorable.jsonToMap(json) match {
case None => false
@@ -90,7 +105,8 @@ object CrossrefScorable {
val doi = Scorable.getString(map, "DOI")
val authors: List[String] = mapToAuthorList(map)
val year: Int = mapToYear(map).getOrElse(0)
- if (doi.isEmpty || doi == null || authors.length == 0) {
+ val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE")
+ if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
MapFeatures(Scorable.NoSlug, json)
} else {
val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 0cb12ee..f598cae 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -74,6 +74,8 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author")
+ val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+ val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
// Unit tests
"CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
@@ -139,4 +141,11 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
it should "return false for invalid JSON" in {
CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
}
+
+ it should "handle content types" in {
+ val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType)
+ resultWrong.slug shouldBe Scorable.NoSlug
+ val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType)
+ resultMissing.slug shouldBe Scorable.NoSlug
+ }
}