{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# example with xgboost\n", "\n", "Test XGBoost after it was compiled, pickle, unpickle."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["<div id=\"my_id_menu_nb\">run previous cell, wait for 2 seconds</div>\n", "<script>\n", "function repeat_indent_string(n){\n", "    var a = \"\" ;\n", "    for ( ; n > 0 ; --n)\n", "        a += \"    \";\n", "    return a;\n", "}\n", "// look up into all sections and builds an automated menu //\n", "var update_menu_string = function(begin, lfirst, llast, sformat, send, keep_item, begin_format, end_format) {\n", "    var anchors = document.getElementsByClassName(\"section\");\n", "    if (anchors.length == 0) {\n", "        anchors = document.getElementsByClassName(\"text_cell_render rendered_html\");\n", "    }\n", "    var i,t;\n", "    var text_menu = begin;\n", "    var text_memo = \"<pre>\\nlength:\" + anchors.length + \"\\n\";\n", "    var ind = \"\";\n", "    var memo_level = 1;\n", "    var href;\n", "    var tags = [];\n", "    var main_item = 0;\n", "    var format_open = 0;\n", "    for (i = 0; i <= llast; i++)\n", "        tags.push(\"h\" + i);\n", "\n", "    for (i = 0; i < anchors.length; i++) {\n", "        text_memo += \"**\" + anchors[i].id + \"--\\n\";\n", "\n", "        var child = null;\n", "        for(t = 0; t < tags.length; t++) {\n", "            var r = anchors[i].getElementsByTagName(tags[t]);\n", "            if (r.length > 0) {\n", "child = r[0];\n", "break;\n", "            }\n", "        }\n", "        if (child == null) {\n", "            text_memo += \"null\\n\";\n", "            continue;\n", "        }\n", "        if (anchors[i].hasAttribute(\"id\")) {\n", "            // when converted in RST\n", "            href = anchors[i].id;\n", "            text_memo += \"#1-\" + href;\n", "            // passer \u00e0 child suivant (le chercher)\n", "        }\n", "        else if (child.hasAttribute(\"id\")) {\n", "            // in a notebook\n", "            href = child.id;\n", "            text_memo += \"#2-\" + href;\n", "        }\n", "        else {\n", "            text_memo += \"#3-\" + \"*\" + \"\\n\";\n", "            continue;\n", "        }\n", "        var title = child.textContent;\n", "        var level = parseInt(child.tagName.substring(1,2));\n", "\n", "        text_memo += \"--\" + level + \"?\" + lfirst + \"--\" + title + \"\\n\";\n", "\n", "        if ((level < lfirst) || (level > llast)) {\n", "            continue ;\n", "        }\n", "        if (title.endsWith('\u00b6')) {\n", "            title = title.substring(0,title.length-1).replace(\"<\", \"&lt;\")\n", "         .replace(\">\", \"&gt;\").replace(\"&\", \"&amp;\");\n", "        }\n", "        if (title.length == 0) {\n", "            continue;\n", "        }\n", "\n", "        while (level < memo_level) {\n", "            text_menu += end_format + \"</ul>\\n\";\n", "            format_open -= 1;\n", "            memo_level -= 1;\n", "        }\n", "        if (level == lfirst) {\n", "            main_item += 1;\n", "        }\n", "        if (keep_item != -1 && main_item != keep_item + 1) {\n", "            // alert(main_item + \" - \" + level + \" - \" + keep_item);\n", "            continue;\n", "        }\n", "        while (level > memo_level) {\n", "            text_menu += \"<ul>\\n\";\n", "            memo_level += 1;\n", "        }\n", "        text_menu += repeat_indent_string(level-2);\n", "        text_menu += begin_format + sformat.replace(\"__HREF__\", href).replace(\"__TITLE__\", title);\n", "        format_open += 1;\n", "    }\n", "    while (1 < memo_level) {\n", "        text_menu += end_format + \"</ul>\\n\";\n", "        memo_level -= 1;\n", "        format_open -= 1;\n", "    }\n", "    text_menu += send;\n", "    //text_menu += \"\\n\" + text_memo;\n", "\n", "    while (format_open > 0) {\n", "        text_menu += end_format;\n", "        format_open -= 1;\n", "    }\n", "    return text_menu;\n", "};\n", "var update_menu = function() {\n", "    var sbegin = \"\";\n", "    var sformat = '<a href=\"#__HREF__\">__TITLE__</a>';\n", "    var send = \"\";\n", "    var begin_format = '<li>';\n", "    var end_format = '</li>';\n", "    var keep_item = -1;\n", "    var text_menu = update_menu_string(sbegin, 2, 4, sformat, send, keep_item,\n", "       begin_format, end_format);\n", "    var menu = document.getElementById(\"my_id_menu_nb\");\n", "    menu.innerHTML=text_menu;\n", "};\n", "window.setTimeout(update_menu,2000);\n", "            </script>"], "text/plain": ["<IPython.core.display.HTML object>"]}, "execution_count": 2, "metadata": {}, "output_type": "execute_result"}], "source": ["from jyquickhelper import add_notebook_menu\n", "add_notebook_menu()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["This is an example taken from [xgboost website](https://github.com/dmlc/xgboost)."]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": ["import pickle\n", "\n", "import numpy as np\n", "from sklearn.model_selection import KFold, train_test_split\n", "from sklearn.metrics import confusion_matrix, mean_squared_error\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.datasets import load_iris, load_digits, load_diabetes"]}, {"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": ["import xgboost as xgb"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Zeros and Ones from the Digits dataset: binary classification"]}, {"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [{"data": {"text/plain": ["[array([[87,  0],\n", "        [ 1, 92]], dtype=int64), array([[91,  0],\n", "        [ 3, 86]], dtype=int64)]"]}, "execution_count": 5, "metadata": {}, "output_type": "execute_result"}], "source": ["rng = np.random.RandomState(31337)\n", "\n", "digits = load_digits(2)\n", "y = digits['target']\n", "X = digits['data']\n", "conf = []\n", "kf = KFold(n_splits=2, shuffle=True, random_state=rng)\n", "for train_index, test_index in kf.split(X, y):\n", "    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])\n", "    predictions = xgb_model.predict(X[test_index])\n", "    actuals = y[test_index]\n", "    conf.append(confusion_matrix(actuals, predictions))\n", "conf"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Iris: multiclass classification"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [{"data": {"text/plain": ["[array([[19,  0,  0],\n", "        [ 0, 31,  3],\n", "        [ 0,  1, 21]], dtype=int64), array([[31,  0,  0],\n", "        [ 0, 16,  0],\n", "        [ 0,  3, 25]], dtype=int64)]"]}, "execution_count": 6, "metadata": {}, "output_type": "execute_result"}], "source": ["iris = load_iris()\n", "y = iris['target']\n", "X = iris['data']\n", "kf = KFold(n_splits=2, shuffle=True, random_state=rng)\n", "conf = []\n", "for train_index, test_index in kf.split(X, y):\n", "    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])\n", "    predictions = xgb_model.predict(X[test_index])\n", "    actuals = y[test_index]\n", "    conf.append(confusion_matrix(actuals, predictions))\n", "conf"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Diabetes: regression"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"data": {"text/plain": ["[9.860776812557337, 15.942418468446029]"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["data = load_diabetes()\n", "y = data['target']\n", "X = data['data']\n", "err = []\n", "kf = KFold(n_splits=2, shuffle=True, random_state=rng)\n", "for train_index, test_index in kf.split(X, y):\n", "    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])\n", "    predictions = xgb_model.predict(X[test_index])\n", "    actuals = y[test_index]\n", "    err.append(mean_squared_error(actuals, predictions))\n", "err"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Parameter optimization"]}, {"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Fitting 5 folds for each of 9 candidates, totalling 45 fits\n"]}, {"name": "stderr", "output_type": "stream", "text": ["[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", "[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    2.3s finished\n", "c:\\python370_x64\\lib\\site-packages\\sklearn\\model_selection\\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n", "  DeprecationWarning)\n"]}, {"data": {"text/plain": ["(0.6699572097100618, {'max_depth': 2, 'n_estimators': 100})"]}, "execution_count": 8, "metadata": {}, "output_type": "execute_result"}], "source": ["import joblib  # to check you can parallelize GridSearchCV\n", "y = boston['target']\n", "X = boston['data']\n", "xgb_model = xgb.XGBRegressor()\n", "clf = GridSearchCV(xgb_model,\n", "                   {'max_depth': [2,4,6],\n", "                    'n_estimators': [50,100,200]}, verbose=1, n_jobs=1, pre_dispatch=1, cv=5)\n", "clf.fit(X,y)\n", "clf.best_score_, clf.best_params_"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Pickling sklearn API models"]}, {"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [{"data": {"text/plain": ["True"]}, "execution_count": 9, "metadata": {}, "output_type": "execute_result"}], "source": ["# The sklearn API models are picklable\n", "# must open in binary format to pickle\n", "pickle.dump(clf, open(\"best_boston.pkl\", \"wb\"))\n", "clf2 = pickle.load(open(\"best_boston.pkl\", \"rb\"))\n", "np.allclose(clf.predict(X), clf2.predict(X))"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### Early stopping"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["[0]\tvalidation_0-auc:0.999497\n", "Will train until validation_0-auc hasn't improved in 10 rounds.\n", "[1]\tvalidation_0-auc:0.999497\n", "[2]\tvalidation_0-auc:0.999497\n", "[3]\tvalidation_0-auc:0.999749\n", "[4]\tvalidation_0-auc:0.999749\n", "[5]\tvalidation_0-auc:0.999749\n", "[6]\tvalidation_0-auc:0.999749\n", "[7]\tvalidation_0-auc:0.999749\n", "[8]\tvalidation_0-auc:0.999749\n", "[9]\tvalidation_0-auc:0.999749\n", "[10]\tvalidation_0-auc:1\n", "[11]\tvalidation_0-auc:1\n", "[12]\tvalidation_0-auc:1\n", "[13]\tvalidation_0-auc:1\n", "[14]\tvalidation_0-auc:1\n", "[15]\tvalidation_0-auc:1\n", "[16]\tvalidation_0-auc:1\n", "[17]\tvalidation_0-auc:1\n", "[18]\tvalidation_0-auc:1\n", "[19]\tvalidation_0-auc:1\n", "[20]\tvalidation_0-auc:1\n", "Stopping. Best iteration:\n", "[10]\tvalidation_0-auc:1\n", "\n"]}, {"data": {"text/plain": ["XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", "       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", "       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n", "       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,\n", "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n", "       silent=True, subsample=1)"]}, "execution_count": 10, "metadata": {}, "output_type": "execute_result"}], "source": ["X = digits['data']\n", "y = digits['target']\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", "clf = xgb.XGBClassifier()\n", "clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=\"auc\",\n", "        eval_set=[(X_test, y_test)])"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": []}], "metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}}, "nbformat": 4, "nbformat_minor": 2}