Full set of fields

master
EHP 6 years ago
commit 76645f273d
  1. 115
      export.py
  2. 303
      external.ipynb
  3. 2
      src/requirements.txt
  4. 420
      src/train_model.py
  5. 155
      xgboost load.ipynb

@ -0,0 +1,115 @@
from pymongo import MongoClient
from pprint import pprint
from datetime import datetime
import csv
# Connect to a MongoDB server on the default localhost:27017 and select the
# `bonitoo` database; the export loop below reads its pricing_audit collection.
client = MongoClient()
db=client.bonitoo
# CSV column names for the export: nested Mongo document paths are dot-joined.
# The nine per-segment attributes are identical for the inbound and outbound
# legs, so they are generated from a single tuple instead of being listed twice.
_SEGMENT_ATTRIBUTES = (
    'departure',
    'arrival',
    'origin.airportCode',
    'destination.airportCode',
    'flightNumber',
    'travelClass',
    'bookingCode',
    'availability',
    'elapsedFlyingTime',
)
fieldnames = (
    ['timestamp', 'client.channel', 'type']
    + ['flight.inboundSegments.' + attr for attr in _SEGMENT_ATTRIBUTES]
    + ['flight.outboundSegments.' + attr for attr in _SEGMENT_ATTRIBUTES]
    + [
        'flight.inboundEFT',  # elapsed flying time
        'flight.outboundEFT',
        'oneWay',
        'adults',  # number of persons = (adults + children)
        'children',
        'infants',
        'input.price',
        'input.tax',
        'input.currency',
        'success',
        'status',
        'output.price',
        'output.tax',
        'output.currency',
        'duration',  # duration of the call to the upstream system
    ]
)
# 5% nebo 200 kc rozdil nahoru
# -200 kc dolu
# abs(+-10kc) ignorovat
# timestamp + ok price - ma byt v cache od cacheat
# timestamp + notok price - nema byt v cache od cacheat
# delka pobytu prilet-odlet
# delka letu ?
# pokud je chyba tak nocache (= chybi priceout)
# brat v uvahu in/out kody aerolinek (mcx ?) - mirek jeste zjisti
# vypocitat uspesnost je/neni v cache v %
counter = 0
with open('export.csv', mode='w') as ef:
writer = csv.DictWriter(ef, fieldnames=fieldnames, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
# do not write header for s3 files
# writer.writeheader()
for it in db.pricing_audit.find():
counter += 1
if counter % 1000 == 0:
print('Iterace %d' % counter)
d = {
'timestamp': datetime.fromtimestamp(it['timestamp']/1000).isoformat(),
'client.channel': it['client']['channel'],
'type': it['type'],
'flight.outboundSegments.departure': '|'.join([x['departure'].isoformat() for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.arrival': '|'.join([x['arrival'].isoformat() for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.origin.airportCode': '|'.join([x['origin']['airportCode'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.destination.airportCode': '|'.join([x['destination']['airportCode'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.flightNumber': '|'.join([x['flightNumber'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.travelClass': '|'.join([x['travelClass'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.bookingCode': '|'.join([x.get('bookingCode','') for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.availability': '|'.join([str(x.get('availability','')) for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.elapsedFlyingTime': '|'.join([str(x.get('elapsedFlyingTime','')) for x in it['flight']['outboundSegments']]),
'flight.inboundEFT': it['flight'].get('inboundEFT',''),
'flight.outboundEFT': it['flight'].get('outboundEFT',''),
'oneWay': it['oneWay'],
'adults': it['adults'],
'children': it['children'],
'infants': it['infants'],
'input.price': it['input']['price'],
'input.tax': it['input']['tax'],
'input.currency': it['input']['currency'],
'success': it['success'],
'status': it.get('status',''),
'output.price': it.get('output', {'price': 0})['price'],
'output.tax': it.get('output', {'tax': 0})['tax'],
'output.currency': it.get('output', {'currency': 0})['currency'],
'duration': it['duration']
}
if 'inboundSegments' in it['flight']:
inb = {
'flight.inboundSegments.departure': '|'.join([x['departure'].isoformat() for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.arrival': '|'.join([x['arrival'].isoformat() for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.origin.airportCode': '|'.join([x['origin']['airportCode'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.destination.airportCode': '|'.join([x['destination']['airportCode'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.flightNumber': '|'.join([x['flightNumber'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.travelClass': '|'.join([x['travelClass'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.bookingCode': '|'.join([x.get('bookingCode', '') for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.availability': '|'.join([str(x.get('availability','')) for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.elapsedFlyingTime': '|'.join([str(x.get('elapsedFlyingTime','')) for x in it['flight']['inboundSegments']])
}
d = {**d, **inb}
writer.writerow(d)

@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sagemaker\n",
"import boto3\n",
"from sagemaker import get_execution_role\n",
"\n",
"boto_session = boto3.Session(profile_name='bonitoo', region_name='eu-central-1')\n",
"sagemaker_session = sagemaker.LocalSession(boto_session=boto_session)\n",
"#sagemaker_session = sagemaker.Session()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Get a SageMaker-compatible role used by this Notebook Instance.\n",
"#role = get_execution_role()\n",
"role = 'Bonitoo_SageMaker_Execution'\n",
"region = boto_session.region_name"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_input = 's3://customers-bonitoo-cachettl/sagemaker/data/export.csv'"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"from sagemaker.xgboost.estimator import XGBoost\n",
"\n",
"tf = XGBoost(\n",
" entry_point='train_model.py',\n",
" source_dir='./src',\n",
" train_instance_type='local',\n",
" train_instance_count=1,\n",
" role=role,\n",
" sagemaker_session=sagemaker_session,\n",
" framework_version='0.90-1',\n",
" py_version='py3',\n",
" hyperparameters={\n",
" 'bonitoo_price_limit': 1000,\n",
" 'num_round': 15,\n",
" 'max_depth': 15,\n",
" 'eta': 0.5,\n",
" 'num_class': 8,\n",
" 'objective': 'multi:softmax',\n",
" 'eval_metric': 'mlogloss'\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating tmptao5hpuc_algo-1-x6dhm_1 ... \n",
"\u001b[1BAttaching to tmptao5hpuc_algo-1-x6dhm_12mdone\u001b[0m\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker_xgboost_container.training:Invoking user training script.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Module train_model does not provide a setup.py. \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Generating setup.py\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Generating setup.cfg\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Generating MANIFEST.in\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Installing module with the following command:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/bin/python3 -m pip install . -r requirements.txt\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Processing /opt/ml/code\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: pandas in /usr/local/lib/python3.5/dist-packages (from -r requirements.txt (line 1)) (0.24.2)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: numpy in /usr/local/lib/python3.5/dist-packages (from -r requirements.txt (line 2)) (1.17.2)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2019.2)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: python-dateutil>=2.5.0 in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2.8.0)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.5/dist-packages (from python-dateutil>=2.5.0->pandas->-r requirements.txt (line 1)) (1.12.0)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Building wheels for collected packages: train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Building wheel for train-model (setup.py) ... \u001b[?25ldone\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \u001b[?25h Created wheel for train-model: filename=train_model-1.0.0-py2.py3-none-any.whl size=6578 sha256=f2f4bac7a2d0260f534e32b3ac0341fb291f30669499adf59ead09aa62b7ccc5\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Stored in directory: /tmp/pip-ephem-wheel-cache-vdfjugbr/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Successfully built train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Installing collected packages: train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Successfully installed train-model-1.0.0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Invoking user script\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Training Env:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"network_interface_name\": \"eth0\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hosts\": [\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"algo-1-x6dhm\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ],\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"log_level\": 20,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_config_dir\": \"/opt/ml/input/config\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"framework_module\": \"sagemaker_xgboost_container.training:main\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_dir\": \"/opt/ml/input\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"channel_input_dirs\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"training\": \"/opt/ml/input/data/training\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_gpus\": 0,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"job_name\": \"sagemaker-xgboost-2019-10-05-20-16-58-398\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"user_entry_point\": \"train_model.py\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"current_host\": \"algo-1-x6dhm\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"module_dir\": \"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"master_hostname\": \"algo-1-x6dhm\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"module_name\": \"train_model\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"resource_config\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"current_host\": \"algo-1-x6dhm\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hosts\": [\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"algo-1-x6dhm\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"additional_framework_parameters\": {},\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_cpus\": 6,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_data_dir\": \"/opt/ml/output/data\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_data_config\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"training\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"TrainingInputMode\": \"File\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m }\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"is_master\": true,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hyperparameters\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"bonitoo_price_limit\": 1000,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"max_depth\": 15,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"objective\": \"multi:softmax\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_class\": 8,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"eta\": 0.5,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"eval_metric\": \"mlogloss\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_round\": 15\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_dir\": \"/opt/ml/output\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"model_dir\": \"/opt/ml/model\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m }\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Environment variables:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_CONFIG_DIR=/opt/ml/input/config\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_MAX_DEPTH=15\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_LOG_LEVEL=20\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_DIR=/opt/ml/output\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NUM_CPUS=6\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CHANNELS=[\"training\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_NUM_ROUND=15\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_OBJECTIVE=multi:softmax\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_DATA_DIR=/opt/ml/output/data\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_FRAMEWORK_MODULE=sagemaker_xgboost_container.training:main\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NETWORK_INTERFACE_NAME=eth0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_DATA_CONFIG={\"training\":{\"TrainingInputMode\":\"File\"}}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_FRAMEWORK_PARAMS={}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HPS={\"bonitoo_price_limit\":1000,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":15,\"objective\":\"multi:softmax\"}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m PYTHONPATH=/usr/local/bin:/:/usr/local/lib/python3.5/dist-packages/xgboost/dmlc-core/tracker:/usr/lib/python35.zip:/usr/lib/python3.5:/usr/lib/python3.5/plat-x86_64-linux-gnu:/usr/lib/python3.5/lib-dynload:/usr/local/lib/python3.5/dist-packages:/usr/lib/python3/dist-packages\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_RESOURCE_CONFIG={\"current_host\":\"algo-1-x6dhm\",\"hosts\":[\"algo-1-x6dhm\"]}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NUM_GPUS=0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_ETA=0.5\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_NUM_CLASS=8\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODULE_DIR=s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_USER_ARGS=[\"--bonitoo_price_limit\",\"1000\",\"--eta\",\"0.5\",\"--eval_metric\",\"mlogloss\",\"--max_depth\",\"15\",\"--num_class\",\"8\",\"--num_round\",\"15\",\"--objective\",\"multi:softmax\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_USER_ENTRY_POINT=train_model.py\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CURRENT_HOST=algo-1-x6dhm\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_DIR=/opt/ml/input\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CHANNEL_TRAINING=/opt/ml/input/data/training\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_EVAL_METRIC=mlogloss\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODULE_NAME=train_model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_BONITOO_PRICE_LIMIT=1000\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HOSTS=[\"algo-1-x6dhm\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1-x6dhm\",\"framework_module\":\"sagemaker_xgboost_container.training:main\",\"hosts\":[\"algo-1-x6dhm\"],\"hyperparameters\":{\"bonitoo_price_limit\":1000,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":15,\"objective\":\"multi:softmax\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"training\":{\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"sagemaker-xgboost-2019-10-05-20-16-58-398\",\"log_level\":20,\"master_hostname\":\"algo-1-x6dhm\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\",\"module_name\":\"train_model\",\"network_interface_name\":\"eth0\",\"num_cpus\":6,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1-x6dhm\",\"hosts\":[\"algo-1-x6dhm\"]},\"user_entry_point\":\"train_model.py\"}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODEL_DIR=/opt/ml/model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Invoking script with the following command:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/bin/python3 -m train_model --bonitoo_price_limit 1000 --eta 0.5 --eval_metric mlogloss --max_depth 15 --num_class 8 --num_round 15 --objective multi:softmax\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ERROR:sagemaker-containers:ExecuteUserScriptError:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Command \"/usr/bin/python3 -m train_model --bonitoo_price_limit 1000 --eta 0.5 --eval_metric mlogloss --max_depth 15 --num_class 8 --num_round 15 --objective multi:softmax\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:hyperparameters {'num_round': 15, 'num_class': 8, 'objective': 'multi:softmax', 'eta': 0.5, 'max_depth': 15, 'eval_metric': ['mlogloss']}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:channels {'training': {'TrainingInputMode': 'File'}}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Determined delimiter of CSV input is ','\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Loading csv file export.csv\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Preprocessing start\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/pandas/core/indexing.py:543: SettingWithCopyWarning: \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m A value is trying to be set on a copy of a slice from a DataFrame.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Try using .loc[row_indexer,col_indexer] = value instead\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m self.obj[item] = s\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/pandas/core/indexing.py:362: SettingWithCopyWarning: \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m A value is trying to be set on a copy of a slice from a DataFrame.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Try using .loc[row_indexer,col_indexer] = value instead\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m self.obj[key] = _infer_fill_value(value)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Computing cached times\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Splitting dataset with ration 0.800000\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m if getattr(data, 'base', None) is not None and \\\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/xgboost/core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m data.base is not None and isinstance(data, np.ndarray) \\\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Single node training.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Train matrix has 25393 rows\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Validation matrix has 6314 rows\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:cols: ['index', 'adults', 'children', 'infants', 'input.price', 'input.tax', 'status', 'output.price', 'output.tax', 'duration', 'client.channel_codes', 'type_codes', 'flight.inboundSegments.departure_codes', 'flight.inboundSegments.arrival_codes', 'flight.inboundSegments.origin.airportCode_codes', 'flight.inboundSegments.destination.airportCode_codes', 'flight.inboundSegments.flightNumber_codes', 'flight.inboundSegments.travelClass_codes', 'flight.inboundSegments.bookingCode_codes', 'flight.inboundSegments.availability_codes', 'flight.inboundSegments.elapsedFlyingTime_codes', 'flight.outboundSegments.departure_codes', 'flight.outboundSegments.arrival_codes', 'flight.outboundSegments.origin.airportCode_codes', 'flight.outboundSegments.destination.airportCode_codes', 'flight.outboundSegments.flightNumber_codes', 'flight.outboundSegments.travelClass_codes', 'flight.outboundSegments.bookingCode_codes', 'flight.outboundSegments.availability_codes', 'flight.outboundSegments.elapsedFlyingTime_codes', 'flight.inboundEFT_codes', 'flight.outboundEFT_codes', 'input.currency_codes', 'output.currency_codes', 'oneWay_codes']\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Traceback (most recent call last):\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/usr/lib/python3.5/runpy.py\", line 184, in _run_module_as_main\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"__main__\", mod_spec)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/usr/lib/python3.5/runpy.py\", line 85, in _run_code\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m exec(code, run_globals)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 417, in <module>\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m checkpoint_config=checkpoint_config\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 320, in sagemaker_train\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m train_job(**train_args)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 364, in train_job\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m raise Exception(\"cols: %s\" % str(train_dmatrix.feature_names))\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Exception: cols: ['index', 'adults', 'children', 'infants', 'input.price', 'input.tax', 'status', 'output.price', 'output.tax', 'duration', 'client.channel_codes', 'type_codes', 'flight.inboundSegments.departure_codes', 'flight.inboundSegments.arrival_codes', 'flight.inboundSegments.origin.airportCode_codes', 'flight.inboundSegments.destination.airportCode_codes', 'flight.inboundSegments.flightNumber_codes', 'flight.inboundSegments.travelClass_codes', 'flight.inboundSegments.bookingCode_codes', 'flight.inboundSegments.availability_codes', 'flight.inboundSegments.elapsedFlyingTime_codes', 'flight.outboundSegments.departure_codes', 'flight.outboundSegments.arrival_codes', 'flight.outboundSegments.origin.airportCode_codes', 'flight.outboundSegments.destination.airportCode_codes', 'flight.outboundSegments.flightNumber_codes', 'flight.outboundSegments.travelClass_codes', 'flight.outboundSegments.bookingCode_codes', 'flight.outboundSegments.availability_codes', 'flight.outboundSegments.elapsedFlyingTime_codes', 'flight.inboundEFT_codes', 'flight.outboundEFT_codes', 'input.currency_codes', 'output.currency_codes', 'oneWay_codes']\n",
"\u001b[36mtmptao5hpuc_algo-1-x6dhm_1 exited with code 1\n",
"\u001b[0mAborting on container exit...\n"
]
},
{
"ename": "RuntimeError",
"evalue": "Failed to run: ['docker-compose', '-f', '/tmp/tmptao5hpuc/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 148\u001b[0;31m \u001b[0m_stream_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 149\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36m_stream_output\u001b[0;34m(process)\u001b[0m\n\u001b[1;32m 656\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexit_code\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 657\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Process exited with code: %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mexit_code\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 658\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Process exited with code: 1",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-51-ffc1ca8d95fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mestimator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'training'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtrain_input\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;31m#estimator = sklearn.attach('sagemaker-scikit-learn-2019-01-25-16-34-38-829')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, inputs, wait, logs, job_name)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_for_training\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_TrainingJob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart_new\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mstart_new\u001b[0;34m(cls, estimator, inputs)\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_add_spot_checkpoint_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_mode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 863\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mtrain_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 864\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 865\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_current_job_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path)\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[0mLOGGER\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Creating training-job with name: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0mLOGGER\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"train request: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_request\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 392\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_training_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mtrain_request\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 393\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 394\u001b[0m def compile_model(\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/local_session.py\u001b[0m in \u001b[0;36mcreate_training_job\u001b[0;34m(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mtraining_job\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_LocalTrainingJob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0mhyperparameters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"HyperParameters\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"HyperParameters\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 101\u001b[0;31m \u001b[0mtraining_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mInputDataConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mOutputDataConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhyperparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTrainingJobName\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0mLocalSagemakerClient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_training_jobs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mTrainingJobName\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining_job\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/entities.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m self.model_artifacts = self.container.train(\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0minput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhyperparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m )\n\u001b[1;32m 91\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[0;31m# which contains the exit code and append the command line to it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Failed to run: %s, %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mcompose_command\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 153\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 154\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[0martifacts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompose_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Failed to run: ['docker-compose', '-f', '/tmp/tmptao5hpuc/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1"
]
}
],
"source": [
"estimator = tf.fit({'training': train_input})\n",
"#estimator = sklearn.attach('sagemaker-scikit-learn-2019-01-25-16-34-38-829')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,2 @@
pandas
numpy

@ -0,0 +1,420 @@
import os
import json
import logging
import argparse
import pandas as pd
import xgboost as xgb
import numpy as np
from sagemaker_algorithm_toolkit import exceptions as exc
from sagemaker_xgboost_container.constants import sm_env_constants
from sagemaker_xgboost_container.data_utils import get_content_type, get_dmatrix, get_size, validate_data_file_path
from sagemaker_xgboost_container import distributed
from sagemaker_xgboost_container import checkpointing
from sagemaker_xgboost_container.algorithm_mode import channel_validation as cv
from sagemaker_xgboost_container.algorithm_mode import hyperparameter_validation as hpv
from sagemaker_xgboost_container.algorithm_mode import metrics as metrics_mod
from sagemaker_xgboost_container.algorithm_mode import train_utils
from sagemaker_xgboost_container.constants.xgb_constants import CUSTOMER_ERRORS
# Column names for the header-less CSV export rows (assigned to the frame in
# get_csv_pandas).  Dotted names mirror the source MongoDB document paths of
# the price-check records produced by the companion export script.
columns = [
    'timestamp',
    'client.channel',
    'type',
    'flight.inboundSegments.departure',
    'flight.inboundSegments.arrival',
    'flight.inboundSegments.origin.airportCode',
    'flight.inboundSegments.destination.airportCode',
    'flight.inboundSegments.flightNumber',
    'flight.inboundSegments.travelClass',
    'flight.inboundSegments.bookingCode',
    'flight.inboundSegments.availability',
    'flight.inboundSegments.elapsedFlyingTime',
    'flight.outboundSegments.departure',
    'flight.outboundSegments.arrival',
    'flight.outboundSegments.origin.airportCode',
    'flight.outboundSegments.destination.airportCode',
    'flight.outboundSegments.flightNumber',
    'flight.outboundSegments.travelClass',
    'flight.outboundSegments.bookingCode',
    'flight.outboundSegments.availability',
    'flight.outboundSegments.elapsedFlyingTime',
    'flight.inboundEFT',   # elapsed flying time, inbound leg
    'flight.outboundEFT',  # elapsed flying time, outbound leg
    'oneWay',
    'adults',    # passenger count = adults + children (per the export script)
    'children',
    'infants',
    'input.price',
    'input.tax',
    'input.currency',
    'success',
    'status',
    'output.price',
    'output.tax',
    'output.currency',
    'duration'   # duration of the call to the upstream system
]
# Columns treated as categorical: preprocess_data converts each to pandas
# 'category' dtype and adds a parallel integer '<name>_codes' column that is
# used as the actual model feature.
catcolumns = [
    'client.channel',
    'type',
    'flight.inboundSegments.departure',
    'flight.inboundSegments.arrival',
    'flight.inboundSegments.origin.airportCode',
    'flight.inboundSegments.destination.airportCode',
    'flight.inboundSegments.flightNumber',
    'flight.inboundSegments.travelClass',
    'flight.inboundSegments.bookingCode',
    'flight.inboundSegments.availability',
    'flight.inboundSegments.elapsedFlyingTime',
    'flight.outboundSegments.departure',
    'flight.outboundSegments.arrival',
    'flight.outboundSegments.origin.airportCode',
    'flight.outboundSegments.destination.airportCode',
    'flight.outboundSegments.flightNumber',
    'flight.outboundSegments.travelClass',
    'flight.outboundSegments.bookingCode',
    'flight.outboundSegments.availability',
    'flight.outboundSegments.elapsedFlyingTime',
    'flight.inboundEFT',
    'flight.outboundEFT',
    'input.currency',
    'output.currency',
    'oneWay'
]
# Columns cast to float64 in preprocess_data.
floatcolumns = [
    'input.price',
    'input.tax',
    'output.price',
    'output.tax'
]
# Columns cast to int32 in preprocess_data; missing values are filled with -1.
intcolumns = [
    'adults',
    'children',
    'infants',
    'status',
    'duration'
]
# Key columns identifying one itinerary; rows are grouped by this key to
# measure how long a price stayed unchanged (see compute_cached_time and
# expected_value).
pkcolumns = [
    'flight.inboundSegments.departure',
    'flight.inboundSegments.origin.airportCode',
    'flight.inboundSegments.destination.airportCode',
    'flight.outboundSegments.departure',
    'flight.outboundSegments.origin.airportCode',
    'flight.outboundSegments.destination.airportCode'
]
def expected_value(row, cachetime_df, price_limit=1000):
    """Derive the integer training label for one request row.

    The label buckets how long this itinerary's price was observed unchanged
    (the spread between the first and last record sharing the same pkcolumns
    key): 0 = no usable window ("do not cache"), then increasing buckets at
    12h / 1d / 2d / 3d / 7d / 14d, capped at 7.  The bucket is shifted by
    -1 when input and output prices differ by more than price_limit and by
    +1 when they match exactly (absolute differences under 0.1 count as a
    match).

    row          -- one preprocessed record (pandas Series).
    cachetime_df -- result of compute_cached_time(): per-key min/max
                    timestamps with a MultiIndex over pkcolumns.
    price_limit  -- absolute price difference above which the quote is
                    considered wrong.
    """
    # TODO sum tax + price ?
    inprice, outprice = row['input.price'], row['output.price']
    pricestatus = abs(inprice - outprice)
    # TODO correct in cache time ?
    # Observation window for this itinerary key.
    # NOTE(review): the lookup key is NaN-filled with '' here, but
    # compute_cached_time groups the raw (unfilled) values -- confirm that
    # rows with missing key parts resolve as intended.
    timestamps = cachetime_df.loc[row[pkcolumns].fillna(''), 'timestamp']
    tdiff = timestamps['max'] - timestamps['min']
    if tdiff.shape[0] > 0:
        incachetime = tdiff[0]
    else:
        incachetime = np.timedelta64('NaT')
    modifier = 0
    if pricestatus > price_limit:
        # Quoted price drifted too far from the verified price: demote.
        modifier = -1
    if pricestatus < 0.1:
        # Exact price match.
        # NOTE(review): `not incachetime` relies on the truthiness of a
        # timedelta64 / NaT value, which is numpy-version dependent --
        # confirm the intended branch for NaT and zero-length windows.
        if not incachetime:
            return 3
        modifier = 1
    if pd.isnull(incachetime):
        return 0
    if incachetime <= np.timedelta64(12, 'h'):
        return 1 + modifier
    if incachetime <= np.timedelta64(1, 'D'):
        return 2 + modifier
    if incachetime <= np.timedelta64(2, 'D'):
        return 3 + modifier
    if incachetime <= np.timedelta64(3, 'D'):
        return 4 + modifier
    if incachetime <= np.timedelta64(7, 'D'):
        return 5 + modifier
    if incachetime <= np.timedelta64(14, 'D'):
        return 6 + modifier
    # Top bucket; min() keeps a +1 modifier from exceeding 7.
    return min(7, 7 + modifier)
def compute_cached_time(df):
    """Return the earliest and latest observed timestamp per itinerary key.

    Groups rows by pkcolumns and aggregates 'timestamp' with min and max;
    the result is consumed by expected_value() to measure price stability.
    """
    logging.info('Computing cached times')
    grouped = df.set_index(pkcolumns).groupby(pkcolumns)
    return grouped.agg({'timestamp': ['min', 'max']})
def preprocess_data(df):
    """Clean the raw export frame and derive model-ready columns.

    Keeps only rows where the upstream price check succeeded, parses the
    timestamps, converts categorical columns to pandas 'category' dtype
    (adding an integer '<name>_codes' twin per column), and normalizes the
    float/int columns' dtypes.  Returns the processed DataFrame.
    """
    logging.info('Preprocessing start')
    # Take an explicit copy of the filtered rows so the .loc assignments
    # below do not hit pandas' SettingWithCopy ambiguity on a slice.
    df = df[df.loc[:, 'success'] == True].copy()
    # Vectorized parse instead of a per-row apply; same result, much faster.
    df.loc[:, 'timestamp'] = pd.to_datetime(df.loc[:, 'timestamp'])
    # Encode booleans as strings so 'oneWay' can be handled as a category.
    booleanDictionary = {True: 'TRUE', False: 'FALSE'}
    df.loc[:, 'oneWay'] = df.loc[:, 'oneWay'].replace(booleanDictionary)
    for cc in catcolumns:
        df.loc[:, cc] = df.loc[:, cc].astype('category')
        # Integer code twin: the raw category columns are dropped later in
        # remove_non_features, the codes become the model features.
        df.loc[:, '%s_codes' % cc] = df[cc].cat.codes
    df.loc[:, floatcolumns] = df.loc[:, floatcolumns].astype('float64')
    # -1 marks missing values so the columns can be stored as int32.
    df.loc[:, intcolumns] = df.loc[:, intcolumns].fillna(-1).astype('int32')
    return df
def remove_non_features(df):
    """Split off non-feature columns.

    Returns (features_df, original_df): the first frame has the timestamp,
    success flag and the raw categorical columns removed; the second is the
    untouched input (kept for saving the category encoders later).
    """
    non_features = ['timestamp', 'success'] + catcolumns
    return df.drop(non_features, axis=1), df
def train_test_split(df, label, ratio):
    """Randomly split features and labels into train/test partitions.

    df    -- feature DataFrame.
    label -- label DataFrame aligned row-for-row with df.
    ratio -- expected fraction of rows assigned to the train split.

    Returns (train_data, test_data, train_label, test_label), each with a
    fresh RangeIndex.
    """
    # Fixed typo in the log message ("ration" -> "ratio").
    logging.info('Splitting dataset with ratio %f', ratio)
    # One shared boolean mask keeps features and labels aligned.
    msk = np.random.rand(len(df)) < ratio
    train_data = df[msk].reset_index()
    test_data = df[~msk].reset_index()
    train_label = label[msk].reset_index()
    test_label = label[~msk].reset_index()
    # NOTE(review): reset_index() keeps the old index as an extra 'index'
    # column, which later reaches the DMatrix as a feature; confirm this is
    # intended (use reset_index(drop=True) otherwise).
    return train_data, test_data, train_label, test_label
def get_csv_pandas(files_path):
    """Load one header-less CSV file and assign the export column names.

    files_path -- either a CSV file path or a directory; for a directory the
                  first regular file found in it is loaded.

    Raises exc.UserError when the file cannot be read or parsed.
    """
    # BUGFIX: the original joined files_path onto itself when it was already
    # a file ("data.csv/data.csv" for relative paths).  Resolve the full
    # path once, up front.
    if os.path.isfile(files_path):
        csv_path = files_path
    else:
        first_file = [f for f in os.listdir(files_path)
                      if os.path.isfile(os.path.join(files_path, f))][0]
        csv_path = os.path.join(files_path, first_file)
    try:
        logging.info('Loading csv file %s', csv_path)
        df = pd.read_csv(csv_path, header=None)
        df.columns = columns
        return df
    except Exception as e:
        raise exc.UserError("Failed to load csv data with exception:\n{}".format(e))
def get_pandas_df(data_path):
    """Locate the CSV data under data_path and load it as a DataFrame.

    Returns None when data_path does not exist.  A file path is loaded
    directly; for a directory tree, the first leaf directory (one without
    subdirectories) is handed to get_csv_pandas.
    """
    if not os.path.exists(data_path):
        return None
    if os.path.isfile(data_path):
        files_path = data_path
    else:
        for root, dirs, files in os.walk(data_path):
            if not dirs:
                files_path = root
                break
    return get_csv_pandas(files_path)
def get_df(train_path, validate_path, content_type='text/csv'):
    """Validate and load the training and validation channels as DataFrames.

    Either element of the returned (train, validation) pair is None when the
    corresponding path is missing or empty.
    """
    train_size = get_size(train_path) if train_path else 0
    validate_size = get_size(validate_path) if validate_path else 0
    total_mb = round((train_size + validate_size) / (1024 * 1024), 2)
    logging.debug("File size need to be processed in the node: {}mb.".format(total_mb))
    if train_size > 0:
        validate_data_file_path(train_path, content_type)
    if validate_size > 0:
        validate_data_file_path(validate_path, content_type)
    train_df = get_pandas_df(train_path) if train_size > 0 else None
    validate_df = get_pandas_df(validate_path) if validate_size > 0 else None
    return train_df, validate_df
def get_dmatrices(train_pandas, train_label_pandas, val_pandas, val_label_pandas, ratio=0.8):
    """Build the (train, validation) DMatrix pair for xgb.train().

    When val_pandas is None the training frame is randomly split instead,
    with `ratio` as the train fraction.  Label frames must carry a 'label'
    column (see the expected_value pipeline).
    """
    # BUGFIX: `if val_pandas:` raises "The truth value of a DataFrame is
    # ambiguous" whenever validation data is actually supplied; the intent
    # is a presence check, so compare against None explicitly.
    if val_pandas is not None:
        train_dmatrix = xgb.DMatrix(train_pandas, label=train_label_pandas.loc[:, 'label'])
        val_dmatrix = xgb.DMatrix(val_pandas, label=val_label_pandas.loc[:, 'label'])
    else:
        train_data, test_data, train_label, test_label = train_test_split(train_pandas, train_label_pandas, ratio)
        train_dmatrix = xgb.DMatrix(train_data, label=train_label.loc[:, 'label'])
        val_dmatrix = xgb.DMatrix(test_data, label=test_label.loc[:, 'label'])
    return train_dmatrix, val_dmatrix
def save_encoders(encoder_location, df):
    """Persist the category -> integer-code mapping of every categorical column as JSON."""
    logging.info('Saving encoders')
    encoders = {}
    for column in catcolumns:
        categories = df[column].cat.categories
        encoders[column] = {category: position for position, category in enumerate(categories)}
    with open(encoder_location, 'w') as out:
        json.dump(encoders, out)
def sagemaker_train(train_config, data_config, train_path, val_path, model_dir, sm_hosts, sm_current_host,
                    checkpoint_config):
    """Entry point for a SageMaker training run.

    Validates hyperparameters, loads and preprocesses the CSV channels,
    derives the cache-stability label, and launches train_job() -- either
    directly on a single node or under Rabit for distributed training.

    train_config      -- raw hyperparameter dict from SageMaker (string values).
    data_config       -- channel configuration dict.
    train_path        -- local path of the 'training' channel.
    val_path          -- local path of the optional 'validation' channel.
    model_dir         -- directory that receives the model artifacts.
    sm_hosts          -- list of hosts participating in the job.
    sm_current_host   -- this host's name.
    checkpoint_config -- checkpoint configuration (may contain 'LocalPath').

    Raises exc.UserError for empty channels and exc.PlatformError for an
    invalid host count.
    """
    metrics = metrics_mod.initialize()
    hyperparameters = hpv.initialize(metrics)
    # Custom knob (not an XGBoost hyperparameter): max abs(input-output)
    # price difference still counted as a matching quote; see expected_value.
    price_limit = int(train_config.get('bonitoo_price_limit', 1000))
    # Strip SageMaker/bonitoo control keys and stray quotes before validating
    # the remaining values as real XGBoost hyperparameters.
    train_config = {k: v.replace('"', '') for k, v in train_config.items()
                    if not k.startswith('sagemaker_') and not k.startswith('bonitoo_')}
    train_config = hyperparameters.validate(train_config)
    if train_config.get("updater"):
        train_config["updater"] = ",".join(train_config["updater"])
    logging.info("hyperparameters {}".format(train_config))
    logging.info("channels {}".format(data_config))
    # Get Training and Validation Data Matrices
    validation_channel = data_config.get('validation', None)
    checkpoint_dir = checkpoint_config.get("LocalPath", None)
    train_df, val_df = get_df(train_path, val_path)
    train_df = preprocess_data(train_df)
    # Labels derive from how long each itinerary's price stayed unchanged.
    cachetime_df = compute_cached_time(train_df)
    train_label_df = train_df.apply(lambda x: expected_value(x, cachetime_df, price_limit), axis=1).to_frame(name='label')
    train_df, train_df_orig = remove_non_features(train_df)
    val_label_df = None
    # BUGFIX: `if val_df:` raises "The truth value of a DataFrame is
    # ambiguous" whenever a validation channel is present; test for None.
    if val_df is not None:
        val_df = preprocess_data(val_df)
        val_label_df = val_df.apply(lambda x: expected_value(x, cachetime_df, price_limit), axis=1).to_frame(name='label')
        val_df, val_df_orig = remove_non_features(val_df)
    train_dmatrix, val_dmatrix = get_dmatrices(train_df, train_label_df, val_df, val_label_df)
    train_args = dict(
        train_cfg=train_config,
        train_dmatrix=train_dmatrix,
        train_df=train_df_orig,
        val_dmatrix=val_dmatrix,
        model_dir=model_dir,
        checkpoint_dir=checkpoint_dir)
    # Obtain information about training resources to determine whether to set up Rabit or not
    num_hosts = len(sm_hosts)
    if num_hosts > 1:
        # Wait for hosts to find each other
        logging.info("Distributed node training with {} hosts: {}".format(num_hosts, sm_hosts))
        distributed.wait_hostname_resolution(sm_hosts)
        if not train_dmatrix:
            logging.warning("Host {} does not have data. Will broadcast to cluster and will not be used in distributed"
                            " training.".format(sm_current_host))
        distributed.rabit_run(exec_fun=train_job, args=train_args, include_in_training=(train_dmatrix is not None),
                              hosts=sm_hosts, current_host=sm_current_host, update_rabit_args=True)
    elif num_hosts == 1:
        if train_dmatrix:
            if validation_channel:
                if not val_dmatrix:
                    raise exc.UserError("No data in validation channel path {}".format(val_path))
            logging.info("Single node training.")
            train_args.update({'is_master': True})
            train_job(**train_args)
        else:
            raise exc.UserError("No data in training channel path {}".format(train_path))
    else:
        raise exc.PlatformError("Number of hosts should be an int greater than or equal to 1")
def train_job(train_cfg, train_dmatrix, val_dmatrix, train_df, model_dir, checkpoint_dir, is_master):
    """Run one XGBoost training job and persist the model and encoders.

    train_cfg      -- validated hyperparameter dict; 'num_round' is required.
    train_dmatrix  -- training DMatrix.
    val_dmatrix    -- optional validation DMatrix (None disables evaluation).
    train_df       -- original (pre-feature-drop) training DataFrame; used
                      only to dump the categorical encoders.
    model_dir      -- output directory for model artifacts.
    checkpoint_dir -- checkpoint directory, or falsy to disable saving.
    is_master      -- only the master host writes artifacts.

    Raises exc.UserError for known customer errors raised inside xgb.train()
    and exc.AlgorithmError for any other training failure.
    """
    # Parse arguments for train() API
    early_stopping_rounds = train_cfg.get('early_stopping_rounds')
    num_round = int(train_cfg["num_round"])
    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.get("_tuning_objective_metric")
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)
    # Set callback evals
    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist.append((val_dmatrix, 'validation'))
    # Resume from a previous checkpoint if one exists; the remaining number
    # of boosting rounds shrinks by the rounds already completed.
    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    num_round -= iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)
    callbacks = []
    callbacks.append(checkpointing.print_checkpointed_evaluation(start_iteration=iteration))
    if checkpoint_dir:
        save_checkpoint = checkpointing.save_checkpoint(checkpoint_dir, start_iteration=iteration)
        callbacks.append(save_checkpoint)
    logging.info("Train matrix has {} rows".format(train_dmatrix.num_row()))
    if val_dmatrix:
        logging.info("Validation matrix has {} rows".format(val_dmatrix.num_row()))
    # TODO remove
    #logging.info("cols: %s", str(train_dmatrix.feature_names))
    #raise Exception("cols: %s" % str(train_dmatrix.feature_names))
    try:
        bst = xgb.train(train_cfg, train_dmatrix, num_boost_round=num_round, evals=watchlist, feval=configured_feval,
                        early_stopping_rounds=early_stopping_rounds, callbacks=callbacks, xgb_model=xgb_model,
                        verbose_eval=False)
    except Exception as e:
        # Known, user-actionable errors become UserError; everything else is
        # treated as an internal algorithm failure.
        for customer_error_message in CUSTOMER_ERRORS:
            if customer_error_message in str(e):
                raise exc.UserError(str(e))
        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix, str(e)))
    # NOTE(review): model_dir is created on every host, but artifacts are
    # written only on the master.
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if is_master:
        encoder_location = model_dir + '/encoder.json'
        save_encoders(encoder_location, train_df)
        logging.info("Stored encoders at {}".format(encoder_location))
        model_location = model_dir + '/xgboost-model.bin'
        bst.save_model(model_location)
        logging.info("Stored trained model at {}".format(model_location))
if __name__ == '__main__':
    # Small helper: read one of the JSON config files SageMaker mounts into
    # the training container.
    def _read_json(path):
        with open(path, "r") as fh:
            return json.load(fh)

    train_config = _read_json(os.getenv(sm_env_constants.SM_INPUT_TRAINING_CONFIG_FILE))
    data_config = _read_json(os.getenv(sm_env_constants.SM_INPUT_DATA_CONFIG_FILE))
    checkpoint_config_file = os.getenv(sm_env_constants.SM_CHECKPOINT_CONFIG_FILE)
    checkpoint_config = _read_json(checkpoint_config_file) if os.path.exists(checkpoint_config_file) else {}
    # Channel paths and cluster topology come from the SageMaker environment.
    train_path = os.environ['SM_CHANNEL_TRAINING']
    val_path = os.environ.get(sm_env_constants.SM_CHANNEL_VALIDATION)
    sm_hosts = json.loads(os.environ[sm_env_constants.SM_HOSTS])
    sm_current_host = os.environ[sm_env_constants.SM_CURRENT_HOST]
    model_dir = os.getenv(sm_env_constants.SM_MODEL_DIR)
    sagemaker_train(
        train_config=train_config, data_config=data_config,
        train_path=train_path, val_path=val_path, model_dir=model_dir,
        sm_hosts=sm_hosts, sm_current_host=sm_current_host,
        checkpoint_config=checkpoint_config
    )

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save