{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# Matrices en 3 colonnes\n", "\n", "Repr\u00e9sentation d'une matrice avec Spark / Map / Reduce."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["<div id=\"my_id_menu_nb\">run previous cell, wait for 2 seconds</div>\n", "<script>\n", "function repeat_indent_string(n){\n", "    var a = \"\" ;\n", "    for ( ; n > 0 ; --n) {\n", "        a += \"    \";\n", "    }\n", "    return a;\n", "}\n", "var update_menu_string = function(begin, lfirst, llast, sformat, send, keep_item) {\n", "    var anchors = document.getElementsByClassName(\"section\");\n", "    if (anchors.length == 0) {\n", "        anchors = document.getElementsByClassName(\"text_cell_render rendered_html\");\n", "    }\n", "    var i,t;\n", "    var text_menu = begin;\n", "    var text_memo = \"<pre>\\nlength:\" + anchors.length + \"\\n\";\n", "    var ind = \"\";\n", "    var memo_level = 1;\n", "    var href;\n", "    var tags = [];\n", "    var main_item = 0;\n", "    for (i = 0; i <= llast; i++) {\n", "        tags.push(\"h\" + i);\n", "    }\n", "\n", "    for (i = 0; i < anchors.length; i++) {\n", "        text_memo += \"**\" + anchors[i].id + \"--\\n\";\n", "\n", "        var child = null;\n", "        for(t = 0; t < tags.length; t++) {\n", "            var r = anchors[i].getElementsByTagName(tags[t]);\n", "            if (r.length > 0) {\n", "child = r[0];\n", "break;\n", "            }\n", "        }\n", "        if (child == null){\n", "            text_memo += \"null\\n\";\n", "            continue;\n", "        }\n", "        if (anchors[i].hasAttribute(\"id\")) {\n", "            // when converted in RST\n", "            href = anchors[i].id;\n", "            text_memo += \"#1-\" + href;\n", "            // passer \u00e0 child suivant (le chercher)\n", "        }\n", "        else if (child.hasAttribute(\"id\")) {\n", "            // in a notebook\n", "            href = child.id;\n", "            text_memo += \"#2-\" + href;\n", "        }\n", "        else {\n", "            text_memo += \"#3-\" + \"*\" + \"\\n\";\n", "            continue;\n", "        }\n", "        var title = child.textContent;\n", "        var level = parseInt(child.tagName.substring(1,2));\n", "\n", "        text_memo += \"--\" + level + \"?\" + lfirst + \"--\" + title + \"\\n\";\n", "\n", "        if ((level < lfirst) || (level > llast)) {\n", "            continue ;\n", "        }\n", "        if (title.endsWith('\u00b6')) {\n", "            title = title.substring(0,title.length-1).replace(\"<\", \"&lt;\").replace(\">\", \"&gt;\").replace(\"&\", \"&amp;\")\n", "        }\n", "\n", "        if (title.length == 0) {\n", "            continue;\n", "        }\n", "\n", "        while (level < memo_level) {\n", "            text_menu += \"</ul>\\n\";\n", "            memo_level -= 1;\n", "        }\n", "        if (level == lfirst) {\n", "            main_item += 1;\n", "        }\n", "        if (keep_item != -1 && main_item != keep_item + 1) {\n", "            // alert(main_item + \" - \" + level + \" - \" + keep_item);\n", "            continue;\n", "        }\n", "        while (level > memo_level) {\n", "            text_menu += \"<ul>\\n\";\n", "            memo_level += 1;\n", "        }\n", "        text_menu += repeat_indent_string(level-2) + sformat.replace(\"__HREF__\", href).replace(\"__TITLE__\", title);\n", "    }\n", "    while (1 < memo_level) {\n", "        text_menu += \"</ul>\\n\";\n", "        memo_level -= 1;\n", "    }\n", "    text_menu += send;\n", "    //text_menu += \"\\n\" + text_memo;\n", "    return text_menu;\n", "};\n", "var update_menu = function() {\n", "    var sbegin = \"\";\n", "    var sformat = '<li><a href=\"#__HREF__\">__TITLE__</a></li>';\n", "    var send = \"\";\n", "    var keep_item = -1;\n", "    var text_menu = update_menu_string(sbegin, 2, 4, sformat, send, keep_item);\n", "    var menu = document.getElementById(\"my_id_menu_nb\");\n", "    menu.innerHTML=text_menu;\n", "};\n", "window.setTimeout(update_menu,2000);\n", "            </script>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Ce notebook propose d'impl\u00e9menter un produit matriciel sous Spark. Spark comme SQL n'aime pas trop avoir un nombre de colonnes variables. La premi\u00e8re \u00e9tape consiste \u00e0 transformer les matrices $I\\times J$ en tableau de trois colonnes $(i,j,coefficient)$."]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Cr\u00e9ation d'une matrice al\u00e9atoire"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/plain": ["array([[ 3.16485005,  4.55944572],\n", "       [ 2.5267903 ,  2.87023564],\n", "       [ 2.54568848,  3.04477188],\n", "       [ 1.97955889,  2.62345798],\n", "       [ 2.62883641,  4.11144949],\n", "       [ 2.78826895,  3.49846327],\n", "       [ 2.21395203,  2.90355441],\n", "       [ 2.23250159,  3.21322361],\n", "       [ 2.42321287,  3.4138286 ],\n", "       [ 2.13862033,  3.38881814]])"]}, "execution_count": 3, "metadata": {}, "output_type": "execute_result"}], "source": ["from numpy.random import rand\n", "rnd1 = rand(10,10)\n", "rnd2 = rand(10, 2)\n", "rnd1 @ rnd2"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>0</th>\n", "      <th>1</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>0.797985</td>\n", "      <td>0.429407</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>0.621650</td>\n", "      <td>0.495996</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>0.388457</td>\n", "      <td>0.513059</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>0.481193</td>\n", "      <td>0.758338</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>0.704995</td>\n", "      <td>0.846644</td>\n", "    </tr>\n", "    <tr>\n", "      <th>5</th>\n", "      <td>0.338331</td>\n", "      <td>0.795261</td>\n", "    </tr>\n", "    <tr>\n", "      <th>6</th>\n", "      <td>0.168499</td>\n", "      <td>0.982211</td>\n", "    </tr>\n", "    <tr>\n", "      <th>7</th>\n", "      <td>0.149587</td>\n", "      <td>0.408392</td>\n", "    </tr>\n", "    <tr>\n", "      <th>8</th>\n", "      <td>0.964368</td>\n", "      <td>0.393823</td>\n", "    </tr>\n", "    <tr>\n", "      <th>9</th>\n", "      <td>0.106360</td>\n", "      <td>0.981409</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["          0         1\n", "0  0.797985  0.429407\n", "1  0.621650  0.495996\n", "2  0.388457  0.513059\n", "3  0.481193  0.758338\n", "4  0.704995  0.846644\n", "5  0.338331  0.795261\n", "6  0.168499  0.982211\n", "7  0.149587  0.408392\n", "8  0.964368  0.393823\n", "9  0.106360  0.981409"]}, "execution_count": 4, "metadata": {}, "output_type": "execute_result"}], "source": ["import pandas\n", "df1 = pandas.DataFrame(rnd1)\n", "df2 = pandas.DataFrame(rnd2)\n", "df2"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": ["df1.to_csv(\"rnd1.txt\", sep=\"\\t\", header=None, index=False)\n", "df2.to_csv(\"rnd2.txt\", sep=\"\\t\", header=None, index=False)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Conversion d'une matrice au format Spark\n", "\n", "Lorsqu'un traitement est distribu\u00e9 en Map/Reduce, il n'est pas possible de s'appuyer sur l'ordre dans lequel sont trait\u00e9es les lignes. Le plus est d'ajouter cette information sur chaque ligne plut\u00f4t que de chercher \u00e0 la r\u00e9cup\u00e9rer."]}, {"cell_type": "code", "execution_count": 5, "metadata": {"collapsed": true}, "outputs": [], "source": ["df1.to_csv(\"rnd1.txt\", sep=\"\\t\", header=None, index=True)\n", "df2.to_csv(\"rnd2.txt\", sep=\"\\t\", header=None, index=True)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {"collapsed": true}, "outputs": [], "source": ["def process_mat_row(row):\n", "    values = row.split(\"\\t\")\n", "    index = int(values[0])\n", "    values = [float(_) for _ in values[1:]]\n", "    return [[index, j, v] for j, v in enumerate(values)]"]}, {"cell_type": "code", "execution_count": 7, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/plain": ["[[0, 0, 0.9644291538662529],\n", " [0, 1, 0.4373852433376806],\n", " [0, 2, 0.9381996928112303],\n", " [0, 3, 0.9667835285319625],\n", " [0, 4, 0.4641212389055033],\n", " [0, 5, 0.9623402255972683],\n", " [0, 6, 0.5503835704242525],\n", " [0, 7, 0.44419152080661695],\n", " [0, 8, 0.4246234335886486],\n", " [0, 9, 0.6790142143195625],\n", " [1, 0, 0.8978778490345252],\n", " [1, 1, 0.4449340737705302]]"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["mat1 = sc.textFile(\"rnd1.txt\")\n", "new_mat1 = mat1.flatMap(process_mat_row)\n", "new_mat1.take(12)"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/plain": ["[[0, 0, 0.7979848790126467],\n", " [0, 1, 0.42940652289456605],\n", " [1, 0, 0.6216501174339032],\n", " [1, 1, 0.49599627482280284],\n", " [2, 0, 0.3884569290726675],\n", " [2, 1, 0.5130585892599168],\n", " [3, 0, 0.4811927042491243],\n", " [3, 1, 0.7583376223390912],\n", " [4, 0, 0.7049954049585642],\n", " [4, 1, 0.8466443457915623],\n", " [5, 0, 0.3383307412367995],\n", " [5, 1, 0.7952613070751512]]"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["mat2 = sc.textFile(\"rnd2.txt\")\n", "new_mat2 = mat2.flatMap(process_mat_row)\n", "new_mat2.take(12)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Produit matriciel\n", "\n", "Il faut d'abord faire la jointure avec la m\u00e9thode [join](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.join). Il faut que la cl\u00e9 soit sur la premi\u00e8re colonne."]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"data": {"text/plain": ["[(0, ((0, 0.9644291538662529), (0, 0.7979848790126467))),\n", " (0, ((0, 0.9644291538662529), (1, 0.42940652289456605))),\n", " (0, ((1, 0.8978778490345252), (0, 0.7979848790126467))),\n", " (0, ((1, 0.8978778490345252), (1, 0.42940652289456605))),\n", " (0, ((2, 0.9974373360415959), (0, 0.7979848790126467))),\n", " (0, ((2, 0.9974373360415959), (1, 0.42940652289456605))),\n", " (0, ((3, 0.6998511854857753), (0, 0.7979848790126467))),\n", " (0, ((3, 0.6998511854857753), (1, 0.42940652289456605))),\n", " (0, ((4, 0.7225040569303974), (0, 0.7979848790126467))),\n", " (0, ((4, 0.7225040569303974), (1, 0.42940652289456605))),\n", " (0, ((5, 0.6833261146005197), (0, 0.7979848790126467))),\n", " (0, ((5, 0.6833261146005197), (1, 0.42940652289456605)))]"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["def key_ij(row):\n", "    return row[0], (row[1], row[2])\n", "def key_ji(row):\n", "    return row[1], (row[0], row[2])\n", "mat_join = new_mat1.map(key_ji).join(new_mat2.map(key_ij))\n", "mat_join.take(12)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On effectue le produit matriciel."]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [{"data": {"text/plain": ["[(0, 0, 0.7695998816642311),\n", " (0, 1, 0.4141321695398561),\n", " (1, 0, 0.716492946729951),\n", " (1, 1, 0.3855546051379675),\n", " (2, 0, 0.7959399119238495),\n", " (2, 1, 0.4283060982748405),\n", " (3, 0, 0.5584706635767238),\n", " (3, 1, 0.30052066410308675),\n", " (4, 0, 0.5765473124557496),\n", " (4, 1, 0.31024795486369955),\n", " (5, 0, 0.5452839068856777),\n", " (5, 1, 0.29342469087366296)]"]}, "execution_count": 11, "metadata": {}, "output_type": "execute_result"}], "source": ["def produit_matriciel(row):\n", "    index, ((i, v1), (j, v2)) = row\n", "    return i, j, v1 * v2\n", "produit = mat_join.map(produit_matriciel)\n", "produit.take(12)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Il ne reste plus qu'\u00e0 agr\u00e9ger [reduceByKey](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.reduceByKey). La documentation fournit un exemple facilement transposable. Elle indique aussi : *Merge the values for each key using an associative and commutative reduce function.* Pourquoi pr\u00e9cise-t-elle **associative et commutative** ? Cela signifie que le r\u00e9sultat ne d\u00e9pend pas de l'ordre dans lequel l'agr\u00e9gation est r\u00e9alis\u00e9e et qu'on peut commencer \u00e0 agr\u00e9ger sans attendre d'avoir regroup\u00e9 toutes les valeurs associ\u00e9es \u00e0 une cl\u00e9.\n", "\n", "* *Cas 1 :* [groupBy](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.groupBy) + agr\u00e9gation qui commence une fois les valeurs regroup\u00e9es\n", "* *Cas 2 :* [reduceByKey](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.reduceByKey) + agr\u00e9gation qui commence d\u00e8s les premi\u00e8res valeurs regroup\u00e9es\n", "\n", "Le cas 2 est moins consommateur en terme de donn\u00e9es. Le cas 1 n'est possible que si les valeurs agr\u00e9g\u00e9es ne sont pas trop nombreuses. Ca tombe bien, dans notre cas, le cas 2 convient."]}, {"cell_type": "code", "execution_count": 11, "metadata": {"scrolled": false}, "outputs": [{"data": {"text/plain": ["[((0, 0), 3.164850046348241),\n", " ((0, 1), 4.559445715024405),\n", " ((1, 0), 2.526790300879841),\n", " ((1, 1), 2.8702356426731646),\n", " ((2, 0), 2.5456884797140247),\n", " ((2, 1), 3.04477187797909),\n", " ((3, 0), 1.9795588879982224),\n", " ((3, 1), 2.623457980006711),\n", " ((4, 0), 2.6288364080082656),\n", " ((4, 1), 4.111449492587058),\n", " ((5, 0), 2.788268947579333),\n", " ((5, 1), 3.498463270496026),\n", " ((6, 0), 2.2139520348118236),\n", " ((6, 1), 2.903554407097735),\n", " ((7, 0), 2.232501586646612),\n", " ((7, 1), 3.213223607913268),\n", " ((8, 0), 2.42321286851472),\n", " ((8, 1), 3.4138285975924623),\n", " ((9, 0), 2.1386203274215574),\n", " ((9, 1), 3.3888181357124005)]"]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["from operator import add\n", "final = produit.map(lambda row: ((row[0], row[1]), row[2])).reduceByKey(add)\n", "aslist = final.collect()\n", "aslist.sort()\n", "aslist"]}, {"cell_type": "markdown", "metadata": {}, "source": ["R\u00e9sultat initial :"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [{"data": {"text/plain": ["array([[ 3.16485005,  4.55944572],\n", "       [ 2.5267903 ,  2.87023564],\n", "       [ 2.54568848,  3.04477188],\n", "       [ 1.97955889,  2.62345798],\n", "       [ 2.62883641,  4.11144949],\n", "       [ 2.78826895,  3.49846327],\n", "       [ 2.21395203,  2.90355441],\n", "       [ 2.23250159,  3.21322361],\n", "       [ 2.42321287,  3.4138286 ],\n", "       [ 2.13862033,  3.38881814]])"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["rnd1 @ rnd2"]}, {"cell_type": "markdown", "metadata": {"collapsed": true}, "source": ["## M\u00eame algorithme avec les Spark DataFrame\n", "\n", "On a besoin de r\u00e9aliser un [flatMap](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.flatMap). Une fa\u00e7on de faire est de cr\u00e9er des colonnes qui sont de type compos\u00e9 : un tableau, une structure. La multiplication des lignes est obtenue avec la fonction [explode](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.functions.explode)."]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["schema = [\"index\"] + [\"c%d\" % i for i in range(1, 11)]\n", "mat1 = spark.createDataFrame(pandas.read_csv(\"rnd1.txt\", header=None, sep=\"\\t\"), schema=schema)"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["root\n", " |-- index: long (nullable = true)\n", " |-- c1: double (nullable = true)\n", " |-- c2: double (nullable = true)\n", " |-- c3: double (nullable = true)\n", " |-- c4: double (nullable = true)\n", " |-- c5: double (nullable = true)\n", " |-- c6: double (nullable = true)\n", " |-- c7: double (nullable = true)\n", " |-- c8: double (nullable = true)\n", " |-- c9: double (nullable = true)\n", " |-- c10: double (nullable = true)\n", "\n"]}], "source": ["mat1.printSchema()"]}, {"cell_type": "code", "execution_count": 15, "metadata": {"collapsed": true}, "outputs": [], "source": ["schema = [\"index\"] + [\"c%d\" % i for i in range(1, 3)]\n", "mat2 = spark.createDataFrame(pandas.read_csv(\"rnd2.txt\", header=None, sep=\"\\t\"), schema=schema)"]}, {"cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["root\n", " |-- index: long (nullable = true)\n", " |-- c1: double (nullable = true)\n", " |-- c2: double (nullable = true)\n", "\n"]}], "source": ["mat2.printSchema()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Nous allons avoir besoin de quelques-uns des fonctions et types suivant :\n", "\n", "* [explode](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.functions.explode), [posexplode](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.functions.posexplode), [array](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.functions.array), [alias](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.DataFrame.alias)\n", "* [StructType](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.types.StructType), [StructField](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.types.StructField)\n", "* [ArrayType](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType)\n", "* [DoubleType](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.types.DoubleType), [IntegerType](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.types.IntegerType)\n", "\n", "Je recommande le type [FloatType](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.types.FloatType) qui prend deux fois moins de place pour une pr\u00e9cision moindre mais suffisante dans la plupart des cas."]}, {"cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": ["from pyspark.sql.types import ArrayType, StructField, StructType, DoubleType, IntegerType\n", "from pyspark.sql.functions import explode, posexplode, array\n", "from pyspark.sql import Row"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["root\n", " |-- index: long (nullable = true)\n", " |-- x: array (nullable = false)\n", " |    |-- element: double (containsNull = true)\n", "\n"]}], "source": ["cols = [\"c%d\" % i for i in range(1, 11)]\n", "mat1_array = mat1.select(mat1.index, array(*cols).alias(\"x\"))\n", "mat1_array.printSchema()"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["root\n", " |-- index: long (nullable = true)\n", " |-- pos: integer (nullable = false)\n", " |-- col: double (nullable = true)\n", "\n"]}], "source": ["mat1_exploded = mat1_array.select(\"index\", posexplode(\"x\"))\n", "mat1_exploded.printSchema()"]}, {"cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [{"data": {"text/plain": ["((10, 11), (100, 3))"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["mat1.toPandas().shape, mat1_exploded.toPandas().shape"]}, {"cell_type": "markdown", "metadata": {}, "source": ["On recommence le m\u00eame proc\u00e9d\u00e9 pour l'autre matrice."]}, {"cell_type": "code", "execution_count": 21, "metadata": {"collapsed": true}, "outputs": [], "source": ["cols = [\"c%d\" % i for i in range(1, 3)]\n", "mat2_array = mat2.select(mat2.index, array(*cols).alias(\"x\"))\n", "mat2_exploded = mat2_array.select(\"index\", posexplode(\"x\"))"]}, {"cell_type": "markdown", "metadata": {}, "source": ["Il ne reste plus qu'\u00e0 faire le produit avec la m\u00e9thode [join](https://spark.apache.org/docs/2.0.2/api/python/pyspark.sql.html#pyspark.sql.DataFrame.join) apr\u00e8s avoir renomm\u00e9 les colonnes avant la jointure pour \u00e9viter les ambigu\u00eft\u00e9s."]}, {"cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": ["mat2_exp2 = mat2_exploded.withColumnRenamed(\"index\", \"index2\") \\\n", "                         .withColumnRenamed(\"pos\", \"pos2\") \\\n", "                         .withColumnRenamed(\"col\", \"col2\")\n", "produit = mat1_exploded.join(mat2_exp2, mat1_exploded.pos == mat2_exp2.index2)"]}, {"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["root\n", " |-- index: long (nullable = true)\n", " |-- pos: integer (nullable = false)\n", " |-- col: double (nullable = true)\n", " |-- index2: long (nullable = true)\n", " |-- pos2: integer (nullable = false)\n", " |-- col2: double (nullable = true)\n", "\n"]}], "source": ["produit.printSchema()"]}, {"cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>index</th>\n", "      <th>pos</th>\n", "      <th>col</th>\n", "      <th>index2</th>\n", "      <th>pos2</th>\n", "      <th>col2</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>0</th>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0.964429</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0.797985</td>\n", "    </tr>\n", "    <tr>\n", "      <th>1</th>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0.964429</td>\n", "      <td>0</td>\n", "      <td>1</td>\n", "      <td>0.429407</td>\n", "    </tr>\n", "    <tr>\n", "      <th>2</th>\n", "      <td>1</td>\n", "      <td>0</td>\n", "      <td>0.897878</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0.797985</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>1</td>\n", "      <td>0</td>\n", "      <td>0.897878</td>\n", "      <td>0</td>\n", "      <td>1</td>\n", "      <td>0.429407</td>\n", "    </tr>\n", "    <tr>\n", "      <th>4</th>\n", "      <td>2</td>\n", "      <td>0</td>\n", "      <td>0.997437</td>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>0.797985</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["   index  pos       col  index2  pos2      col2\n", "0      0    0  0.964429       0     0  0.797985\n", "1      0    0  0.964429       0     1  0.429407\n", "2      1    0  0.897878       0     0  0.797985\n", "3      1    0  0.897878       0     1  0.429407\n", "4      2    0  0.997437       0     0  0.797985"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["produit.toPandas().head()"]}, {"cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": ["prod = produit.select(produit.index.alias(\"i\"), produit.pos2.alias(\"j\"),\n", "                         (produit.col * produit.col2).alias(\"val\"))\n", "final = prod.groupby(\"i\", \"j\").sum(\"val\")"]}, {"cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["root\n", " |-- i: long (nullable = true)\n", " |-- j: integer (nullable = false)\n", " |-- sum(val): double (nullable = true)\n", "\n"]}], "source": ["final.printSchema()"]}, {"cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": ["df = final.toPandas()"]}, {"cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [{"data": {"text/html": ["<div>\n", "<table border=\"1\" class=\"dataframe\">\n", "  <thead>\n", "    <tr style=\"text-align: right;\">\n", "      <th></th>\n", "      <th>i</th>\n", "      <th>j</th>\n", "      <th>sum(val)</th>\n", "    </tr>\n", "  </thead>\n", "  <tbody>\n", "    <tr>\n", "      <th>7</th>\n", "      <td>0</td>\n", "      <td>0</td>\n", "      <td>3.164850</td>\n", "    </tr>\n", "    <tr>\n", "      <th>10</th>\n", "      <td>0</td>\n", "      <td>1</td>\n", "      <td>4.559446</td>\n", "    </tr>\n", "    <tr>\n", "      <th>18</th>\n", "      <td>1</td>\n", "      <td>0</td>\n", "      <td>2.526790</td>\n", "    </tr>\n", "    <tr>\n", "      <th>3</th>\n", "      <td>1</td>\n", "      <td>1</td>\n", "      <td>2.870236</td>\n", "    </tr>\n", "    <tr>\n", "      <th>6</th>\n", "      <td>2</td>\n", "      <td>0</td>\n", "      <td>2.545688</td>\n", "    </tr>\n", "  </tbody>\n", "</table>\n", "</div>"], "text/plain": ["    i  j  sum(val)\n", "7   0  0  3.164850\n", "10  0  1  4.559446\n", "18  1  0  2.526790\n", "3   1  1  2.870236\n", "6   2  0  2.545688"]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["df.sort_values([\"i\", \"j\"]).head()"]}, {"cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [{"data": {"text/plain": ["(20, 3)"]}, "execution_count": 30, "metadata": {}, "output_type": "execute_result"}], "source": ["df.shape"]}, {"cell_type": "code", "execution_count": 30, "metadata": {"collapsed": true}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4"}}, "nbformat": 4, "nbformat_minor": 2}