aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-03 15:49:25 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-03 15:49:25 -0700
commitd7e802610baf6b14409bfde9e345968aed08a641 (patch)
tree23cdb9452e57de71170551808c256adfc7a24a3f
parent2f4cff0265c73f4e4b975176de118bf767d44cd2 (diff)
downloadfatcat-scholar-d7e802610baf6b14409bfde9e345968aed08a641.tar.gz
fatcat-scholar-d7e802610baf6b14409bfde9e345968aed08a641.zip
start fleshing out /about and /help
-rw-r--r--fatcat_scholar/templates/about.html58
-rw-r--r--fatcat_scholar/templates/base.html20
-rw-r--r--fatcat_scholar/templates/help.html81
-rw-r--r--fatcat_scholar/web.py5
4 files changed, 152 insertions, 12 deletions
diff --git a/fatcat_scholar/templates/about.html b/fatcat_scholar/templates/about.html
index ad984ce..59a01ea 100644
--- a/fatcat_scholar/templates/about.html
+++ b/fatcat_scholar/templates/about.html
@@ -1,7 +1,61 @@
{% extends "base.html" %}
{% block main %}
-<h1>Some About Content</h1>
+<h1>About Scholarly Search</h1>
+<p><i>See also: <a href="{{ lang_prefix }}/help">User Guide</a> which lists
+some bugs and known issues</i>
+
+<a name="howitworks"></a>
+<h3>How It Works</h3>
+
+<p>Content in this search index comes from preservation copies at the Internet
+Archive in one of three forms:
+
+<ul>
+ <li><b>public web content</b> in the Wayback Machine web archives
+ (web.archive.org), either identified from historic collecting, crawled
+ specifically to ensure long-term access to scholarly materials, or crawled at
+ the direction of our Archive-It partners
+ <li><b>digitized print material</b> from paper and microform collections
+ purchased and scanned by Internet Archive or our partners
+ <li><b>general materials</b> on the archive.org collections, including
+ content from partner organizations, uploads from the general public, and
+ mirrors of other projects
+</ul>
+
+<p>This <a href="https://www.youtube.com/watch?v=PARqfbYIdXQ">2019 FORCE11
+conference presentation</a> gives an overview of the technical infrastructure
+and goals of the project overall.
+
+<a name="sources"></a>
+<h3>Content Sources</h3>
+
+<p>Metadata comes from <a href="https://fatcat.wiki">fatcat.wiki</a>, an open
+user-editable catalog of scholarly work. It should be possible to track and
+attribute the provenance of content and metadata in all cases; please contact
+us if you have questions or concerns.
+
+<a name="tdm"></a>
+<h3>Text and Data Mining</h3>
+
+<p>We intend to provide researcher access to the full corpus for text and data
+mining purposes. Derived datasets may also be posted publicly for analysis, for
+example a citation graph or N-gram frequencies by year. If you are interested
+or would like to see specific datasets made available, please contact us.
+
+<p>Currently snapshots of the full fatcat metadata corpus and upstream metadata
+sources are uploaded periodically to the
+<a href="https://archive.org/details/ia_biblio_metadata">Bulk Bibliographic
+Metadata</a> collection on archive.org.
+Read more in <a href="https://guide.fatcat.wiki/bulk_exports.html">the Fatcat Guide</a>.
+
+<a name="contact"></a>
+<h3>Contact Information</h3>
+
+<p>The organizational contact information for The Internet Archive is listed at
+<a href="https://archive.org/about/contact.php">https://archive.org/about/contact.php</a>.
+Queries about this search service and the fatcat catalog can be directed to
+<a href="mailto:webservices@archive.org">webservices@archive.org</a>.
+
-<p>Lorem ipsum
{% endblock %}
diff --git a/fatcat_scholar/templates/base.html b/fatcat_scholar/templates/base.html
index e8969d0..b7ee461 100644
--- a/fatcat_scholar/templates/base.html
+++ b/fatcat_scholar/templates/base.html
@@ -153,7 +153,7 @@
</div>
</div>
<div class="ui twelve wide column">
- <form class="" id="search_form" action="{{ lang_prefix }}/search") }}" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction">
+ <form class="" id="search_form" action="{{ lang_prefix }}/search" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction">
<meta itemprop="target" content="https://{{ settings.SCHOLAR_DOMAIN }}/fulltext/search?q={q}"/>
<div class="ui form">
<div class="ui action input large fluid">
@@ -167,7 +167,7 @@
<div style="display: flex; width: 100%; justify-content: space-between; padding-top: 0.5em;">
<div></div>
<div>
- <a href="#" style="order: -1;">{{ _("User Guide") }}</a>
+ <a href="{{ lang_prefix }}/help" style="order: -1;">{{ _("User Guide") }}</a>
</div>
</div>
</div>
@@ -207,20 +207,20 @@
<div class="three wide column">
<h4 class="ui inverted header">scholar.archive.org</h4>
<div class="ui inverted link list">
- <a href="#" class="item">How It Works</a>
- <a href="#" class="item">Content Sources</a>
- <a href="#" class="item">Text and Data Mining</a>
- <a href="#" class="item">Discussion Forum</a>
- <a href="#" class="item">Statistics</a>
+ <a href="{{ lang_prefix }}/about#howitworks" class="item">{{ _("How It Works") }}</a>
+ <a href="{{ lang_prefix }}/about#sources" class="item">{{ _("Content Sources") }}</a>
+ <a href="{{ lang_prefix }}/about#tdm" class="item">{{ _("Text and Data Mining") }}</a>
+ <a href="#" class="item">{{ _("Discussion Forum") }}</a>
+ <a href="#" class="item">{{ _("Statistics") }}</a>
<a href="/help" class="item">{{ _("Help") }}</a>
</div>
</div>
<div class="three wide column">
<h4 class="ui inverted header">Open Infrastructure</h4>
<div class="ui inverted link list">
- <a target="_blank" href="https://fatcat.wiki" class="item">Editable Catalog (Fatcat)</a>
- <a target="_blank" href="https://guide.fatcat.wiki" class="item">Contribute</a>
- <a target="_blank" href="/api/redoc" class="item">Search API</a>
+ <a target="_blank" href="https://fatcat.wiki" class="item">{{ _("Editable Catalog (Fatcat)") }}</a>
+ <a target="_blank" href="https://guide.fatcat.wiki" class="item">{{ _("Contribute") }}</a>
+ <a target="_blank" href="/api/redoc" class="item">{{ _("Search API") }}</a>
<a target="_blank" href="https://status.fatcat.wiki" class="item">{{ _("Service Status") }}</a>
<a target="_blank" href="https://github.com/internetarchive/fatcat-scholar" class="item">{{ _("Source Code") }}</a>
</div>
diff --git a/fatcat_scholar/templates/help.html b/fatcat_scholar/templates/help.html
new file mode 100644
index 0000000..f5486b3
--- /dev/null
+++ b/fatcat_scholar/templates/help.html
@@ -0,0 +1,81 @@
+{% extends "base.html" %}
+
+{% macro example_search_box(query) -%}
+<form class="" id="" action="{{ lang_prefix }}/search" method="get" role="search" aria-label="papers" itemprop="potentialAction" itemscope itemtype="https://schema.org/SearchAction">
+ <meta itemprop="target" content="https://{{ settings.SCHOLAR_DOMAIN }}/fulltext/search?q={q}"/>
+ <div class="ui form">
+ <div class="ui action input large fluid">
+ <input type="search" value="{{ query }}" name="q" aria-label="search metadata" required itemprop="query-input" style="border-radius: 0; border: 1px #999 solid;">
+ <button class="ui green button" style="border-radius: 0; background-color: grey; font-size: 1.2rem;">{{ _("Try It") }}</button>
+ </div>
+ </div>
+</form>
+<br>
+{% endmacro %}
+
+{% block main %}
+<h1>Scholar Search User Guide</h1>
+<p><i>See also: <a href="{{ lang_prefix }}/about">About Scholarly Search</a></i>
+
+<p>In addition to the basic filtering and sorting options, this search
+interface also allows the use of Lucene query syntax in the search box. You can
+restrict term queries on multiple metadata fields using colon statements like
+<code>journal:Science</code>, set filters like <code>lang:de</code>, and
+apply range queries like <code>year:&gt;1989 year:&lt;2000</code>.
+
+
+<h3>Example Queries</h3>
+
+<p>Search for digitized pages about a topic from specific years:
+
+{{ example_search_box('"egyptian pyramid" access_type:ia_sim year:<2000') }}
+
+<p>Search for papers in Chinese matching a term:
+
+{{ example_search_box('lang:zh 临床表现多样') }}
+
+<p>Conference papers with an author name query:
+
+{{ example_search_box('type:paper-conference author:"natasha noy"') }}
+
+<h3>Details</h3>
+
+<p>A partial list of metadata fields is:
+
+<ul>
+ <li>title
+ <li>author
+ <li>journal
+ <li>year
+ <li>issue
+ <li>volume
+ <li>doi
+ <li>type (eg, "article-journal", "dataset", "book")
+ <li>stage (eg, "published", "submitted", "accepted", "draft")
+ <li>lang (value is a 2-character lower-case ISO language code)
+ <li>country (value is a 2-character lower-case ISO country code)
+ <li>access_type (eg, "wayback", "ia_file", "ia_sim")
+ <li>tag
+</ul>
+
+<p>You can restrict to records where the field exists with an asterisk like
+<code>doi:*</code>, and negate any term like
+<code>!type:article-journal</code>.
+
+<p>In-depth documentation of the query syntax is available <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-query-notes">from the open source project</a>. The complete current search document schema can be fetched (in JSON format) <a href="https://search.fatcat.wiki/qa_scholar_fulltext/_mapping">from the search index itself</a>.
+
+
+<h3>Known Issues</h3>
+
+<p>This project is currently a <i>prototype</i>, with only a limited amount of
+content indexed.
+
+<p>Some known bugs and issues:
+
+<ul>
+ <li>web.archive.org PDF links sometimes return "not found" errors. This is impacting up to 1% of recent papers. In almost all cases there is a preserved copy of the file that should be available.
+ <li>Poor metadata quality for conference proceedings. Many are labeled "unpublished" and are not associated with the conference or proceedings in which they appeared.
+ <li>Duplicate versions of same work. Eg, different versions of the same paper or dataset. We are working on basic entity-deduplication in the fatcat catalog.
+ <li>Mis-matching of file content or version with work metadata. For example, sometimes pre-prints or author manuscripts are incorrectly associated with version-of-record metadata, or vice versa.
+</ul>
+{% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index e2fde9b..2fd8b24 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -126,6 +126,11 @@ async def web_about(request: Request, lang: LangPrefix = Depends(LangPrefix)):
return i18n_templates[lang.code].TemplateResponse("about.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix})
+@web.get("/help", include_in_schema=False)
+async def web_help(request: Request, lang: LangPrefix = Depends(LangPrefix)):
+ return i18n_templates[lang.code].TemplateResponse("help.html", {"request": request, "locale": lang.code, "lang_prefix": lang.prefix})
+
+
@web.get("/search", include_in_schema=False)
async def web_search(request: Request, query: FulltextQuery = Depends(FulltextQuery), lang: LangPrefix = Depends(LangPrefix), content: ContentNegotiation = Depends(ContentNegotiation)):