-rw-r--r--  scratch/.gitignore         2
-rw-r--r--  scratch/HelloWorld.ipynb   362
-rw-r--r--  scratch/README.md          10

3 files changed, 374 insertions, 0 deletions

diff --git a/scratch/.gitignore b/scratch/.gitignore
new file mode 100644
index 0000000..01e0dc1
--- /dev/null
+++ b/scratch/.gitignore
@@ -0,0 +1,2 @@
+.env
+.ipynb_checkpoints/
diff --git a/scratch/HelloWorld.ipynb b/scratch/HelloWorld.ipynb
new file mode 100644
index 0000000..e382aa4
--- /dev/null
+++ b/scratch/HelloWorld.ipynb
@@ -0,0 +1,362 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "6974daac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.context import SparkContext"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "e72fbe47",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "pyspark.sql.session.SparkSession"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "SparkSession"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "9333e07a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spark = SparkSession.builder.master(\"local\").appName(\"HelloWorld\").getOrCreate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "adf79021",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "            <div>\n",
+       "                <p><b>SparkSession - hive</b></p>\n",
+       "                \n",
+       "        <div>\n",
+       "            <p><b>SparkContext</b></p>\n",
+       "\n",
+       "            <p><a href=\"http://192.168.179.73:4040\">Spark UI</a></p>\n",
+       "\n",
+       "            <dl>\n",
+       "              <dt>Version</dt>\n",
+       "                <dd><code>v3.1.1</code></dd>\n",
+       "              <dt>Master</dt>\n",
+       "                <dd><code>local[*]</code></dd>\n",
+       "              <dt>AppName</dt>\n",
+       "                <dd><code>PySparkShell</code></dd>\n",
+       "            </dl>\n",
+       "        </div>\n",
+       "        \n",
+       "            </div>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<pyspark.sql.session.SparkSession at 0x7f63b4051650>"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "spark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "324b74ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'3.1.1'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "spark.version"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "856f2700",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sc = SparkContext.getOrCreate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "7218a07f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "        <div>\n",
+       "            <p><b>SparkContext</b></p>\n",
+       "\n",
+       "            <p><a href=\"http://192.168.179.73:4040\">Spark UI</a></p>\n",
+       "\n",
+       "            <dl>\n",
+       "              <dt>Version</dt>\n",
+       "                <dd><code>v3.1.1</code></dd>\n",
+       "              <dt>Master</dt>\n",
+       "                <dd><code>local[*]</code></dd>\n",
+       "              <dt>AppName</dt>\n",
+       "                <dd><code>PySparkShell</code></dd>\n",
+       "            </dl>\n",
+       "        </div>\n",
+       "        "
+      ],
+      "text/plain": [
+       "<SparkContext master=local[*] appName=PySparkShell>"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sc"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "17114da3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "txt = sc.textFile('file:////usr/share/doc/python3/copyright')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "c1af9b77",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "file:////usr/share/doc/python3/copyright MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:0"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "ee37564c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "pyspark.rdd.RDD"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(txt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "208ef97a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "319"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "txt.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "06f807b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "python_lines = txt.filter(lambda line: 'python' in line.lower())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "ff2835b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "52\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(python_lines.count())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "9c8ec8c7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['This is the Debian GNU/Linux prepackaged version of the Python programming',\n",
+       " 'language. Python was written by Guido van Rossum <guido@cwi.nl> and others.',\n",
+       " 'sources from ftp.python.org:/pub/python, based on the Debianization by',\n",
+       " 'Python was created in the early 1990s by Guido van Rossum at Stichting',\n",
+       " \"as a successor of a language called ABC.  Guido remains Python's\",\n",
+       " 'In 1995, Guido continued his work on Python at the Corporation for',\n",
+       " 'In May 2000, Guido and the Python core development team moved to',\n",
+       " 'BeOpen.com to form the BeOpen PythonLabs team.  In October of the same',\n",
+       " 'year, the PythonLabs team moved to Digital Creations (now Zope',\n",
+       " 'Corporation, see http://www.zope.com).  In 2001, the Python Software',\n",
+       " 'Foundation (PSF, see http://www.python.org/psf/) was formed, a',\n",
+       " 'non-profit organization created specifically to own Python-related',\n",
+       " 'All Python releases are Open Source (see http://www.opensource.org for',\n",
+       " 'the Open Source Definition).  Historically, most, but not all, Python',\n",
+       " \"(1) GPL-compatible doesn't mean that we're distributing Python under\",\n",
+       " '    the GPL.  All Python licenses, unlike the GPL, let you distribute',\n",
+       " '    GPL-compatible licenses make it possible to combine Python with',\n",
+       " 'B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON',\n",
+       " 'PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2',\n",
+       " '1. This LICENSE AGREEMENT is between the Python Software Foundation',\n",
+       " 'otherwise using this software (\"Python\") in source or binary form and',\n",
+       " 'distribute, and otherwise use Python alone or in any derivative version,',\n",
+       " 'Python Software Foundation; All Rights Reserved\" are retained in Python alone or',\n",
+       " 'or incorporates Python or any part thereof, and wants to make',\n",
+       " 'the changes made to Python.',\n",
+       " '4. PSF is making Python available to Licensee on an \"AS IS\"',\n",
+       " 'FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT',\n",
+       " '5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON',\n",
+       " 'A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,',\n",
+       " '8. By copying, installing or otherwise using Python, Licensee',\n",
+       " 'BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0',\n",
+       " 'BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1',\n",
+       " '2. Subject to the terms and conditions of this BeOpen Python License',\n",
+       " 'provided, however, that the BeOpen Python License is retained in the',\n",
+       " 'third party.  As an exception, the \"BeOpen Python\" logos available at',\n",
+       " 'http://www.pythonlabs.com/logos.html may be used according to the',\n",
+       " 'CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1',\n",
+       " '(\"Licensee\") accessing and otherwise using Python 1.6.1 software in',\n",
+       " 'prepare derivative works, distribute, and otherwise use Python 1.6.1',\n",
+       " 'Reserved\" are retained in Python 1.6.1 alone or in any derivative',\n",
+       " 'quotes): \"Python 1.6.1 is made available subject to the terms and',\n",
+       " 'Python 1.6.1 may be located on the Internet using the following',\n",
+       " 'or incorporates Python 1.6.1 or any part thereof, and wants to make',\n",
+       " 'the changes made to Python 1.6.1.',\n",
+       " '4. CNRI is making Python 1.6.1 available to Licensee on an \"AS IS\"',\n",
+       " 'FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT',\n",
+       " '5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON',\n",
+       " 'A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,',\n",
+       " 'on Python 1.6.1 that incorporate non-separable material that was',\n",
+       " 'installing or otherwise using Python 1.6.1, Licensee agrees to be',\n",
+       " 'CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2',\n",
+       " 'py3compile, py3clean and debpython module:']"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "python_lines.collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c60ff17",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "cgraph",
+   "language": "python",
+   "name": "cgraph"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/scratch/README.md b/scratch/README.md
new file mode 100644
index 0000000..4c3fa65
--- /dev/null
+++ b/scratch/README.md
@@ -0,0 +1,10 @@
+# PySpark Test Run
+
+* 2020-04-02
+
+Goal: We want to understand which URLs of the citation corpus have been
+preserved. We also want the GWB URL if possible. We'll try pyspark.
+
+Our cluster runs Hadoop 2.6, so we'll try:
+
+    $ PYSPARK_HADOOP_VERSION=2.7 pip install pyspark
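
For quick reference, the notebook above boils down to a few lines of PySpark that check whether the pip-installed package works locally. This is only a sketch: it assumes the install from the README succeeded and reuses the Debian Python copyright file as a stand-in input, but any local text file would do.

    # Minimal local smoke test mirroring scratch/HelloWorld.ipynb.
    # The input path is just an example; swap in any local text file.
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("HelloWorld")
        .getOrCreate()
    )
    sc = spark.sparkContext

    # Read a local text file as an RDD and count lines mentioning "python".
    txt = sc.textFile("file:///usr/share/doc/python3/copyright")
    python_lines = txt.filter(lambda line: "python" in line.lower())

    print(txt.count())           # 319 in the notebook run
    print(python_lines.count())  # 52 in the notebook run

    spark.stop()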
