Computed features

master
EHP 6 years ago
parent d675ca673a
commit 6e4ffcc292
  1. 22
      export.py
  2. 347
      external.ipynb
  3. 97
      inference/src/main/java/cz/aprar/bonitoo/inference/CacheInference.java
  4. 74
      inference/src/main/java/cz/aprar/bonitoo/inference/FlightData.java
  5. 104
      inference/src/test/java/cz/aprar/bonitoo/inference/CacheInferenceTest.java
  6. 2
      inference/src/test/resources/model/encoder.json
  7. BIN
      inference/src/test/resources/model/xgboost-model.bin
  8. 11
      runner.py
  9. 136
      src/train_model.py
  10. 82
      xgboost load.ipynb

@ -13,19 +13,17 @@ fieldnames = [
'flight.inboundSegments.arrival', 'flight.inboundSegments.arrival',
'flight.inboundSegments.origin.airportCode', 'flight.inboundSegments.origin.airportCode',
'flight.inboundSegments.destination.airportCode', 'flight.inboundSegments.destination.airportCode',
'flight.inboundSegments.airline.code',
'flight.outboundSegments.departure', 'flight.outboundSegments.departure',
'flight.outboundSegments.arrival', 'flight.outboundSegments.arrival',
'flight.outboundSegments.origin.airportCode', 'flight.outboundSegments.origin.airportCode',
'flight.outboundSegments.destination.airportCode', 'flight.outboundSegments.destination.airportCode',
'flight.outboundSegments.airline.code',
'input.price', 'input.price',
'input.tax',
'input.currency',
'success', 'success',
'status',
'output.price', 'output.price',
'output.tax', 'cacheAt',
'output.currency', 'cacheExp'
'duration'
] ]
counter = 0 counter = 0
@ -50,15 +48,13 @@ with open('export.csv', mode='w') as ef:
[x['origin']['airportCode'] for x in it['flight']['outboundSegments']]), [x['origin']['airportCode'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.destination.airportCode': '|'.join( 'flight.outboundSegments.destination.airportCode': '|'.join(
[x['destination']['airportCode'] for x in it['flight']['outboundSegments']]), [x['destination']['airportCode'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.airline.code': '|'.join(
[x['airline']['code'] for x in it['flight']['outboundSegments']]),
'input.price': it['input']['price'], 'input.price': it['input']['price'],
'input.tax': it['input']['tax'],
'input.currency': it['input']['currency'],
'success': it['success'], 'success': it['success'],
'status': it.get('status', ''),
'output.price': it.get('output', {'price': 0})['price'], 'output.price': it.get('output', {'price': 0})['price'],
'output.tax': it.get('output', {'tax': 0})['tax'], 'cacheAt': it.get('cacheAt').isoformat() if it.get('cacheAt', None) else '',
'output.currency': it.get('output', {'currency': 0})['currency'], 'cacheExp': it.get('cacheExp').isoformat() if it.get('cacheExp', None) else ''
'duration': it['duration']
} }
if 'inboundSegments' in it['flight']: if 'inboundSegments' in it['flight']:
@ -71,6 +67,8 @@ with open('export.csv', mode='w') as ef:
[x['origin']['airportCode'] for x in it['flight']['inboundSegments']]), [x['origin']['airportCode'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.destination.airportCode': '|'.join( 'flight.inboundSegments.destination.airportCode': '|'.join(
[x['destination']['airportCode'] for x in it['flight']['inboundSegments']]), [x['destination']['airportCode'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.airline.code': '|'.join(
[x['airline']['code'] for x in it['flight']['inboundSegments']]),
} }
d = {**d, **inb} d = {**d, **inb}
writer.writerow(d) writer.writerow(d)

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -17,7 +17,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -29,7 +29,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 33,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -38,7 +38,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 50, "execution_count": 77,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -54,19 +54,22 @@
" framework_version='0.90-1',\n", " framework_version='0.90-1',\n",
" py_version='py3',\n", " py_version='py3',\n",
" hyperparameters={\n", " hyperparameters={\n",
" 'bonitoo_price_limit': 1000,\n", " 'bonitoo_price_pos_abs': 1000,\n",
" 'num_round': 15,\n", " 'bonitoo_price_neg_abs': 200,\n",
" 'bonitoo_price_pos_perc': 0.05,\n",
" 'bonitoo_price_neg_perc': 0.05,\n",
" 'num_round': 20,\n",
" 'max_depth': 15,\n", " 'max_depth': 15,\n",
" 'eta': 0.5,\n", " 'eta': 0.5,\n",
" 'num_class': 8,\n", " 'num_class': 8,\n",
" 'objective': 'multi:softmax',\n", " 'objective': 'multi:softprob',\n",
" 'eval_metric': 'mlogloss'\n", " 'eval_metric': 'mlogloss'\n",
" })" " })"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 51, "execution_count": 78,
"metadata": { "metadata": {
"scrolled": true "scrolled": true
}, },
@ -75,194 +78,158 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Creating tmptao5hpuc_algo-1-x6dhm_1 ... \n", "Creating tmpsn7kurwo_algo-1-hibva_1 ... \n",
"\u001b[1BAttaching to tmptao5hpuc_algo-1-x6dhm_12mdone\u001b[0m\n", "\u001b[1BAttaching to tmpsn7kurwo_algo-1-hibva_12mdone\u001b[0m\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker_xgboost_container.training:Invoking user training script.\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker_xgboost_container.training:Invoking user training script.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Module train_model does not provide a setup.py. \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:Module train_model does not provide a setup.py. \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Generating setup.py\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Generating setup.py\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Generating setup.cfg\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:Generating setup.cfg\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Generating MANIFEST.in\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:Generating MANIFEST.in\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Installing module with the following command:\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:Installing module with the following command:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/bin/python3 -m pip install . -r requirements.txt\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m /miniconda3/bin/python -m pip install . -r requirements.txt\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Processing /opt/ml/code\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Processing /opt/ml/code\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: pandas in /usr/local/lib/python3.5/dist-packages (from -r requirements.txt (line 1)) (0.24.2)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Requirement already satisfied: pandas in /miniconda3/lib/python3.7/site-packages (from -r requirements.txt (line 1)) (0.25.1)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: numpy in /usr/local/lib/python3.5/dist-packages (from -r requirements.txt (line 2)) (1.17.2)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Requirement already satisfied: numpy in /miniconda3/lib/python3.7/site-packages (from -r requirements.txt (line 2)) (1.17.2)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2019.2)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Requirement already satisfied: python-dateutil>=2.6.1 in /miniconda3/lib/python3.7/site-packages (from pandas->-r requirements.txt (line 1)) (2.8.0)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: python-dateutil>=2.5.0 in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2.8.0)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Requirement already satisfied: pytz>=2017.2 in /miniconda3/lib/python3.7/site-packages (from pandas->-r requirements.txt (line 1)) (2019.3)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.5/dist-packages (from python-dateutil>=2.5.0->pandas->-r requirements.txt (line 1)) (1.12.0)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Requirement already satisfied: six>=1.5 in /miniconda3/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas->-r requirements.txt (line 1)) (1.12.0)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Building wheels for collected packages: train-model\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Building wheels for collected packages: train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Building wheel for train-model (setup.py) ... \u001b[?25ldone\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Building wheel for train-model (setup.py) ... \u001b[?25ldone\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \u001b[?25h Created wheel for train-model: filename=train_model-1.0.0-py2.py3-none-any.whl size=6578 sha256=f2f4bac7a2d0260f534e32b3ac0341fb291f30669499adf59ead09aa62b7ccc5\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \u001b[?25h Created wheel for train-model: filename=train_model-1.0.0-py2.py3-none-any.whl size=12596 sha256=1e1372c49fcc19ef6d93ad652d2e5c79e5855068be011b19b2273a3aff1b098f\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Stored in directory: /tmp/pip-ephem-wheel-cache-vdfjugbr/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Stored in directory: /tmp/pip-ephem-wheel-cache-yjganydo/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Successfully built train-model\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Successfully built train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Installing collected packages: train-model\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Installing collected packages: train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Successfully installed train-model-1.0.0\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Successfully installed train-model-1.0.0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Invoking user script\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m INFO:sagemaker-containers:Invoking user script\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Training Env:\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Training Env:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m {\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"network_interface_name\": \"eth0\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"additional_framework_parameters\": {},\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hosts\": [\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"channel_input_dirs\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"algo-1-x6dhm\"\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"training\": \"/opt/ml/input/data/training\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ],\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"log_level\": 20,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"current_host\": \"algo-1-hibva\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_config_dir\": \"/opt/ml/input/config\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"framework_module\": \"sagemaker_xgboost_container.training:main\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"framework_module\": \"sagemaker_xgboost_container.training:main\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"hosts\": [\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_dir\": \"/opt/ml/input\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"algo-1-hibva\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"channel_input_dirs\": {\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m ],\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"training\": \"/opt/ml/input/data/training\"\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"hyperparameters\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"bonitoo_price_pos_abs\": 1000,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_gpus\": 0,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"bonitoo_price_neg_abs\": 200,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"job_name\": \"sagemaker-xgboost-2019-10-05-20-16-58-398\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"bonitoo_price_pos_perc\": 0.05,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"bonitoo_price_neg_perc\": 0.05,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"user_entry_point\": \"train_model.py\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"num_round\": 20,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"current_host\": \"algo-1-x6dhm\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"max_depth\": 15,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"module_dir\": \"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"eta\": 0.5,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"master_hostname\": \"algo-1-x6dhm\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"num_class\": 8,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"module_name\": \"train_model\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"objective\": \"multi:softprob\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"resource_config\": {\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"eval_metric\": \"mlogloss\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"current_host\": \"algo-1-x6dhm\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hosts\": [\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"input_config_dir\": \"/opt/ml/input/config\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"algo-1-x6dhm\"\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"input_data_config\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ]\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"training\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"TrainingInputMode\": \"File\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"additional_framework_parameters\": {},\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m }\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_cpus\": 6,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_data_dir\": \"/opt/ml/output/data\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"input_dir\": \"/opt/ml/input\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_data_config\": {\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"is_master\": true,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"training\": {\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"job_name\": \"sagemaker-xgboost-2019-10-19-17-27-30-738\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"TrainingInputMode\": \"File\"\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"log_level\": 20,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m }\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"master_hostname\": \"algo-1-hibva\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"model_dir\": \"/opt/ml/model\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"is_master\": true,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"module_dir\": \"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-19-17-27-30-738/source/sourcedir.tar.gz\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hyperparameters\": {\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"module_name\": \"train_model\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"bonitoo_price_limit\": 1000,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"network_interface_name\": \"eth0\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"max_depth\": 15,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"num_cpus\": 4,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"objective\": \"multi:softmax\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"num_gpus\": 0,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_class\": 8,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"output_data_dir\": \"/opt/ml/output/data\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"eta\": 0.5,\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"output_dir\": \"/opt/ml/output\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"eval_metric\": \"mlogloss\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_round\": 15\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"resource_config\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"current_host\": \"algo-1-hibva\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_dir\": \"/opt/ml/output\",\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"hosts\": [\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"model_dir\": \"/opt/ml/model\"\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"algo-1-hibva\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m }\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m ]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Environment variables:\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \"user_entry_point\": \"train_model.py\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m }\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_CONFIG_DIR=/opt/ml/input/config\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_MAX_DEPTH=15\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m Environment variables:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_LOG_LEVEL=20\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_DIR=/opt/ml/output\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HOSTS=[\"algo-1-hibva\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NUM_CPUS=6\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_NETWORK_INTERFACE_NAME=eth0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CHANNELS=[\"training\"]\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HPS={\"bonitoo_price_neg_abs\":200,\"bonitoo_price_neg_perc\":0.05,\"bonitoo_price_pos_abs\":1000,\"bonitoo_price_pos_perc\":0.05,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":20,\"objective\":\"multi:softprob\"}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_NUM_ROUND=15\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_USER_ENTRY_POINT=train_model.py\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_OBJECTIVE=multi:softmax\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_FRAMEWORK_PARAMS={}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_DATA_DIR=/opt/ml/output/data\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_RESOURCE_CONFIG={\"current_host\":\"algo-1-hibva\",\"hosts\":[\"algo-1-hibva\"]}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_FRAMEWORK_MODULE=sagemaker_xgboost_container.training:main\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_INPUT_DATA_CONFIG={\"training\":{\"TrainingInputMode\":\"File\"}}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NETWORK_INTERFACE_NAME=eth0\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_OUTPUT_DATA_DIR=/opt/ml/output/data\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_DATA_CONFIG={\"training\":{\"TrainingInputMode\":\"File\"}}\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_CHANNELS=[\"training\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_FRAMEWORK_PARAMS={}\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_CURRENT_HOST=algo-1-hibva\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HPS={\"bonitoo_price_limit\":1000,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":15,\"objective\":\"multi:softmax\"}\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_MODULE_NAME=train_model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m PYTHONPATH=/usr/local/bin:/:/usr/local/lib/python3.5/dist-packages/xgboost/dmlc-core/tracker:/usr/lib/python35.zip:/usr/lib/python3.5:/usr/lib/python3.5/plat-x86_64-linux-gnu:/usr/lib/python3.5/lib-dynload:/usr/local/lib/python3.5/dist-packages:/usr/lib/python3/dist-packages\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_LOG_LEVEL=20\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_RESOURCE_CONFIG={\"current_host\":\"algo-1-x6dhm\",\"hosts\":[\"algo-1-x6dhm\"]}\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_FRAMEWORK_MODULE=sagemaker_xgboost_container.training:main\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NUM_GPUS=0\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_INPUT_DIR=/opt/ml/input\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_ETA=0.5\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_INPUT_CONFIG_DIR=/opt/ml/input/config\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_NUM_CLASS=8\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_OUTPUT_DIR=/opt/ml/output\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODULE_DIR=s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_NUM_CPUS=4\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_USER_ARGS=[\"--bonitoo_price_limit\",\"1000\",\"--eta\",\"0.5\",\"--eval_metric\",\"mlogloss\",\"--max_depth\",\"15\",\"--num_class\",\"8\",\"--num_round\",\"15\",\"--objective\",\"multi:softmax\"]\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_NUM_GPUS=0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_USER_ENTRY_POINT=train_model.py\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_MODEL_DIR=/opt/ml/model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CURRENT_HOST=algo-1-x6dhm\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_MODULE_DIR=s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-19-17-27-30-738/source/sourcedir.tar.gz\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_DIR=/opt/ml/input\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1-hibva\",\"framework_module\":\"sagemaker_xgboost_container.training:main\",\"hosts\":[\"algo-1-hibva\"],\"hyperparameters\":{\"bonitoo_price_neg_abs\":200,\"bonitoo_price_neg_perc\":0.05,\"bonitoo_price_pos_abs\":1000,\"bonitoo_price_pos_perc\":0.05,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":20,\"objective\":\"multi:softprob\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"training\":{\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"sagemaker-xgboost-2019-10-19-17-27-30-738\",\"log_level\":20,\"master_hostname\":\"algo-1-hibva\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-19-17-27-30-738/source/sourcedir.tar.gz\",\"module_name\":\"train_model\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1-hibva\",\"hosts\":[\"algo-1-hibva\"]},\"user_entry_point\":\"train_model.py\"}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CHANNEL_TRAINING=/opt/ml/input/data/training\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_USER_ARGS=[\"--bonitoo_price_neg_abs\",\"200\",\"--bonitoo_price_neg_perc\",\"0.05\",\"--bonitoo_price_pos_abs\",\"1000\",\"--bonitoo_price_pos_perc\",\"0.05\",\"--eta\",\"0.5\",\"--eval_metric\",\"mlogloss\",\"--max_depth\",\"15\",\"--num_class\",\"8\",\"--num_round\",\"20\",\"--objective\",\"multi:softprob\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_EVAL_METRIC=mlogloss\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_CHANNEL_TRAINING=/opt/ml/input/data/training\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODULE_NAME=train_model\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_BONITOO_PRICE_POS_ABS=1000\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_BONITOO_PRICE_LIMIT=1000\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_BONITOO_PRICE_NEG_ABS=200\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HOSTS=[\"algo-1-x6dhm\"]\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_BONITOO_PRICE_POS_PERC=0.05\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1-x6dhm\",\"framework_module\":\"sagemaker_xgboost_container.training:main\",\"hosts\":[\"algo-1-x6dhm\"],\"hyperparameters\":{\"bonitoo_price_limit\":1000,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":15,\"objective\":\"multi:softmax\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"training\":{\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"sagemaker-xgboost-2019-10-05-20-16-58-398\",\"log_level\":20,\"master_hostname\":\"algo-1-x6dhm\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\",\"module_name\":\"train_model\",\"network_interface_name\":\"eth0\",\"num_cpus\":6,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1-x6dhm\",\"hosts\":[\"algo-1-x6dhm\"]},\"user_entry_point\":\"train_model.py\"}\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_BONITOO_PRICE_NEG_PERC=0.05\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODEL_DIR=/opt/ml/model\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_NUM_ROUND=20\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_MAX_DEPTH=15\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Invoking script with the following command:\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_ETA=0.5\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_NUM_CLASS=8\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/bin/python3 -m train_model --bonitoo_price_limit 1000 --eta 0.5 --eval_metric mlogloss --max_depth 15 --num_class 8 --num_round 15 --objective multi:softmax\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_OBJECTIVE=multi:softprob\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m SM_HP_EVAL_METRIC=mlogloss\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n" "\u001b[36malgo-1-hibva_1 |\u001b[0m PYTHONPATH=/miniconda3/bin:/:/usr/local/lib/python3.5/dist-packages/xgboost/dmlc-core/tracker:/miniconda3/lib/python37.zip:/miniconda3/lib/python3.7:/miniconda3/lib/python3.7/lib-dynload:/miniconda3/lib/python3.7/site-packages\n",
"\u001b[36malgo-1-hibva_1 |\u001b[0m \n",
"\u001b[36malgo-1-hibva_1 |\u001b[0m Invoking script with the following command:\n",
"\u001b[36malgo-1-hibva_1 |\u001b[0m \n",
"\u001b[36malgo-1-hibva_1 |\u001b[0m /miniconda3/bin/python -m train_model --bonitoo_price_neg_abs 200 --bonitoo_price_neg_perc 0.05 --bonitoo_price_pos_abs 1000 --bonitoo_price_pos_perc 0.05 --eta 0.5 --eval_metric mlogloss --max_depth 15 --num_class 8 --num_round 20 --objective multi:softprob\n",
"\u001b[36malgo-1-hibva_1 |\u001b[0m \n",
"\u001b[36malgo-1-hibva_1 |\u001b[0m \n"
] ]
}, },
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ERROR:sagemaker-containers:ExecuteUserScriptError:\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [0]\ttrain-mlogloss:0.848017\tvalidation-mlogloss:0.922091\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Command \"/usr/bin/python3 -m train_model --bonitoo_price_limit 1000 --eta 0.5 --eval_metric mlogloss --max_depth 15 --num_class 8 --num_round 15 --objective multi:softmax\"\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [1]\ttrain-mlogloss:0.578424\tvalidation-mlogloss:0.697124\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:hyperparameters {'num_round': 15, 'num_class': 8, 'objective': 'multi:softmax', 'eta': 0.5, 'max_depth': 15, 'eval_metric': ['mlogloss']}\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [2]\ttrain-mlogloss:0.419099\tvalidation-mlogloss:0.572552\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:channels {'training': {'TrainingInputMode': 'File'}}\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [3]\ttrain-mlogloss:0.31692\tvalidation-mlogloss:0.497591\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Determined delimiter of CSV input is ','\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [4]\ttrain-mlogloss:0.247843\tvalidation-mlogloss:0.450857\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Loading csv file export.csv\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [5]\ttrain-mlogloss:0.20313\tvalidation-mlogloss:0.42247\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Preprocessing start\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [6]\ttrain-mlogloss:0.171749\tvalidation-mlogloss:0.404928\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/pandas/core/indexing.py:543: SettingWithCopyWarning: \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [7]\ttrain-mlogloss:0.15009\tvalidation-mlogloss:0.393772\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m A value is trying to be set on a copy of a slice from a DataFrame.\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [8]\ttrain-mlogloss:0.133377\tvalidation-mlogloss:0.385623\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Try using .loc[row_indexer,col_indexer] = value instead\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [9]\ttrain-mlogloss:0.120209\tvalidation-mlogloss:0.378456\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [10]\ttrain-mlogloss:0.110155\tvalidation-mlogloss:0.374374\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [11]\ttrain-mlogloss:0.09938\tvalidation-mlogloss:0.36958\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m self.obj[item] = s\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [12]\ttrain-mlogloss:0.092882\tvalidation-mlogloss:0.366292\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/pandas/core/indexing.py:362: SettingWithCopyWarning: \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [13]\ttrain-mlogloss:0.085552\tvalidation-mlogloss:0.363469\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m A value is trying to be set on a copy of a slice from a DataFrame.\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [14]\ttrain-mlogloss:0.079976\tvalidation-mlogloss:0.363688\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Try using .loc[row_indexer,col_indexer] = value instead\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [15]\ttrain-mlogloss:0.075524\tvalidation-mlogloss:0.36325\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [16]\ttrain-mlogloss:0.069857\tvalidation-mlogloss:0.36269\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [17]\ttrain-mlogloss:0.065141\tvalidation-mlogloss:0.361854\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m self.obj[key] = _infer_fill_value(value)\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [18]\ttrain-mlogloss:0.062355\tvalidation-mlogloss:0.361638\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Computing cached times\n", "\u001b[36malgo-1-hibva_1 |\u001b[0m [19]\ttrain-mlogloss:0.060227\tvalidation-mlogloss:0.361047\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Splitting dataset with ration 0.800000\n", "\u001b[36mtmpsn7kurwo_algo-1-hibva_1 exited with code 0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n", "\u001b[0mAborting on container exit...\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m if getattr(data, 'base', None) is not None and \\\n", "===== Job Complete =====\n"
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/xgboost/core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m data.base is not None and isinstance(data, np.ndarray) \\\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Single node training.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Train matrix has 25393 rows\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Validation matrix has 6314 rows\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:cols: ['index', 'adults', 'children', 'infants', 'input.price', 'input.tax', 'status', 'output.price', 'output.tax', 'duration', 'client.channel_codes', 'type_codes', 'flight.inboundSegments.departure_codes', 'flight.inboundSegments.arrival_codes', 'flight.inboundSegments.origin.airportCode_codes', 'flight.inboundSegments.destination.airportCode_codes', 'flight.inboundSegments.flightNumber_codes', 'flight.inboundSegments.travelClass_codes', 'flight.inboundSegments.bookingCode_codes', 'flight.inboundSegments.availability_codes', 'flight.inboundSegments.elapsedFlyingTime_codes', 'flight.outboundSegments.departure_codes', 'flight.outboundSegments.arrival_codes', 'flight.outboundSegments.origin.airportCode_codes', 'flight.outboundSegments.destination.airportCode_codes', 'flight.outboundSegments.flightNumber_codes', 'flight.outboundSegments.travelClass_codes', 'flight.outboundSegments.bookingCode_codes', 'flight.outboundSegments.availability_codes', 'flight.outboundSegments.elapsedFlyingTime_codes', 'flight.inboundEFT_codes', 'flight.outboundEFT_codes', 'input.currency_codes', 'output.currency_codes', 'oneWay_codes']\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Traceback (most recent call last):\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/usr/lib/python3.5/runpy.py\", line 184, in _run_module_as_main\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"__main__\", mod_spec)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/usr/lib/python3.5/runpy.py\", line 85, in _run_code\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m exec(code, run_globals)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 417, in <module>\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m checkpoint_config=checkpoint_config\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 320, in sagemaker_train\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m train_job(**train_args)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 364, in train_job\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m raise Exception(\"cols: %s\" % str(train_dmatrix.feature_names))\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Exception: cols: ['index', 'adults', 'children', 'infants', 'input.price', 'input.tax', 'status', 'output.price', 'output.tax', 'duration', 'client.channel_codes', 'type_codes', 'flight.inboundSegments.departure_codes', 'flight.inboundSegments.arrival_codes', 'flight.inboundSegments.origin.airportCode_codes', 'flight.inboundSegments.destination.airportCode_codes', 'flight.inboundSegments.flightNumber_codes', 'flight.inboundSegments.travelClass_codes', 'flight.inboundSegments.bookingCode_codes', 'flight.inboundSegments.availability_codes', 'flight.inboundSegments.elapsedFlyingTime_codes', 'flight.outboundSegments.departure_codes', 'flight.outboundSegments.arrival_codes', 'flight.outboundSegments.origin.airportCode_codes', 'flight.outboundSegments.destination.airportCode_codes', 'flight.outboundSegments.flightNumber_codes', 'flight.outboundSegments.travelClass_codes', 'flight.outboundSegments.bookingCode_codes', 'flight.outboundSegments.availability_codes', 'flight.outboundSegments.elapsedFlyingTime_codes', 'flight.inboundEFT_codes', 'flight.outboundEFT_codes', 'input.currency_codes', 'output.currency_codes', 'oneWay_codes']\n",
"\u001b[36mtmptao5hpuc_algo-1-x6dhm_1 exited with code 1\n",
"\u001b[0mAborting on container exit...\n"
]
},
{
"ename": "RuntimeError",
"evalue": "Failed to run: ['docker-compose', '-f', '/tmp/tmptao5hpuc/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 148\u001b[0;31m \u001b[0m_stream_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 149\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36m_stream_output\u001b[0;34m(process)\u001b[0m\n\u001b[1;32m 656\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexit_code\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 657\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Process exited with code: %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mexit_code\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 658\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Process exited with code: 1",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-51-ffc1ca8d95fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mestimator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'training'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtrain_input\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;31m#estimator = sklearn.attach('sagemaker-scikit-learn-2019-01-25-16-34-38-829')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, inputs, wait, logs, job_name)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_for_training\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_TrainingJob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart_new\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mstart_new\u001b[0;34m(cls, estimator, inputs)\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_add_spot_checkpoint_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_mode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 863\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mtrain_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 864\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 865\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_current_job_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path)\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[0mLOGGER\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Creating training-job with name: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0mLOGGER\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"train request: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_request\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 392\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_training_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mtrain_request\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 393\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 394\u001b[0m def compile_model(\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/local_session.py\u001b[0m in \u001b[0;36mcreate_training_job\u001b[0;34m(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mtraining_job\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_LocalTrainingJob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0mhyperparameters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"HyperParameters\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"HyperParameters\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 101\u001b[0;31m \u001b[0mtraining_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mInputDataConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mOutputDataConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhyperparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTrainingJobName\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0mLocalSagemakerClient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_training_jobs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mTrainingJobName\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining_job\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/entities.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m self.model_artifacts = self.container.train(\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0minput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhyperparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m )\n\u001b[1;32m 91\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[0;31m# which contains the exit code and append the command line to it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Failed to run: %s, %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mcompose_command\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 153\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 154\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[0martifacts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompose_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Failed to run: ['docker-compose', '-f', '/tmp/tmptao5hpuc/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1"
] ]
} }
], ],

@ -12,8 +12,12 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors;
public class CacheInference { public class CacheInference {
private static final TTL[] TTL_VALUES = TTL.values(); private static final TTL[] TTL_VALUES = TTL.values();
@ -37,45 +41,94 @@ public class CacheInference {
} }
public TTL cacheTTL(final FlightData data) throws XGBoostError { public TTL cacheTTL(final FlightData data) throws XGBoostError {
float[][] predicts = booster.predict(createMatrix(data)); return cacheTTL(data, ZonedDateTime.now());
return TTL_VALUES[(int) predicts[0][0]]; }
/**
 * Selects the TTL bucket for a flight as of an explicitly supplied instant.
 * Taking the clock as a parameter lets callers replay historical data
 * (backtesting) instead of always using the current time.
 *
 * @param data flight attributes to score
 * @param now  the instant treated as "current" when features are built
 * @return the {@code TTL} constant whose index has the highest value in the
 *         array returned by {@code cacheTTLProbability(data, now)}
 * @throws XGBoostError if the underlying prediction fails
 */
public TTL cacheTTL(final FlightData data, final ZonedDateTime now) throws XGBoostError {
    final int best = argmax(cacheTTLProbability(data, now));
    return TTL_VALUES[best];
}
/**
 * Convenience overload that scores the flight as of the current system time.
 * Delegates to the two-argument overload with {@code ZonedDateTime.now()}.
 *
 * @param data flight attributes to score
 * @return whatever the two-argument overload returns for {@code data}
 * @throws XGBoostError if the underlying prediction fails
 */
public float[] cacheTTLProbability(final FlightData data) throws XGBoostError {
return cacheTTLProbability(data, ZonedDateTime.now());
}
/**
* Method for backtesting
*/
public float[] cacheTTLProbability(final FlightData data, final ZonedDateTime now) throws XGBoostError {
final DMatrix matrix = createMatrix(data, now);
float[][] predicts = booster.predict(matrix);
return predicts[0];
} }
private Booster loadModel(InputStream model) throws XGBoostError, IOException { private Booster loadModel(InputStream model) throws XGBoostError, IOException {
return XGBoost.loadModel(model); return XGBoost.loadModel(model);
} }
private DMatrix createMatrix(final FlightData data) throws XGBoostError { private DMatrix createMatrix(final FlightData data, final ZonedDateTime now) throws XGBoostError {
final float[] arr = new float[18]; final float[] arr = new float[17];
arr[0] = labels.get("client.channel").getOrDefault(data.getClientChannel(), 0); arr[0] = data.getInputPrice().floatValue();
arr[1] = labels.get("type").getOrDefault(data.getType(), 0); arr[1] = data.getSuccess() ? 1 : 0;
arr[2] = labels.get("flight.inboundSegments.departure").getOrDefault(joinList(data.getInboundDeparture()), 0); arr[2] = data.getOutputPrice().floatValue();
arr[3] = labels.get("flight.inboundSegments.arrival").getOrDefault(joinList(data.getInboundArrival()), 0); arr[3] = labels.get("client.channel").getOrDefault(data.getClientChannel(), 0);
arr[4] = labels.get("flight.inboundSegments.origin.airportCode").getOrDefault(joinList(data.getInboundOrigin()), 0); arr[4] = labels.get("type").getOrDefault(data.getType(), 0);
arr[5] = labels.get("flight.inboundSegments.destination.airportCode").getOrDefault(joinList(data.getInboundDestination()), 0); arr[5] = labels.get("flight.inboundSegments.departure").getOrDefault(joinTimestampList(data.getInboundDeparture()), 0);
arr[6] = labels.get("flight.outboundSegments.departure").getOrDefault(joinList(data.getOutboundDeparture()), 0); arr[6] = labels.get("flight.inboundSegments.arrival").getOrDefault(joinTimestampList(data.getInboundArrival()), 0);
arr[7] = labels.get("flight.outboundSegments.arrival").getOrDefault(joinList(data.getOutboundArrival()), 0); arr[7] = labels.get("flight.inboundSegments.origin.airportCode").getOrDefault(joinList(data.getInboundOrigin()), 0);
arr[8] = labels.get("flight.outboundSegments.origin.airportCode").getOrDefault(joinList(data.getOutboundOrigin()), 0); arr[8] = labels.get("flight.inboundSegments.airline.code").getOrDefault(joinList(data.getInboundAirlines()), 0);
arr[9] = labels.get("flight.outboundSegments.destination.airportCode").getOrDefault(joinList(data.getOutboundDestination()), 0); arr[9] = labels.get("flight.inboundSegments.destination.airportCode").getOrDefault(joinList(data.getInboundDestination()), 0);
arr[10] = data.getInputPrice().floatValue(); arr[10] = labels.get("flight.outboundSegments.departure").getOrDefault(joinTimestampList(data.getOutboundDeparture()), 0);
arr[11] = data.getInputTax().floatValue(); arr[11] = labels.get("flight.outboundSegments.arrival").getOrDefault(joinTimestampList(data.getOutboundArrival()), 0);
arr[12] = labels.get("input.currency").getOrDefault(data.getInputCurrency(), 0); arr[12] = labels.get("flight.outboundSegments.origin.airportCode").getOrDefault(joinList(data.getOutboundOrigin()), 0);
arr[13] = data.getStatus().floatValue(); arr[13] = labels.get("flight.outboundSegments.destination.airportCode").getOrDefault(joinList(data.getOutboundDestination()), 0);
arr[14] = data.getOutputPrice().floatValue(); arr[14] = labels.get("flight.outboundSegments.airline.code").getOrDefault(joinList(data.getOutboundAirlines()), 0);
arr[15] = data.getOutputTax().floatValue(); arr[15] = computeDuration(data.getInboundDeparture(), data.getOutboundDeparture());
arr[16] = labels.get("output.currency").getOrDefault(data.getOutputCurrency(), 0); arr[16] = computePrebooking(data.getOutboundDeparture(), now);
arr[17] = data.getDuration().floatValue();
return new DMatrix(arr, 1, arr.length); return new DMatrix(arr, 1, arr.length);
} }
/**
 * Trip length feature: complete days between the first outbound departure
 * and the first inbound departure.
 *
 * @param indeparture  departures of the inbound (return) segments; may be empty
 * @param outdeparture departures of the outbound segments; its first element
 *                     is read whenever {@code indeparture} is non-empty
 * @return 0 when there are no inbound segments, otherwise the number of whole
 *         days from {@code outdeparture.get(0)} to {@code indeparture.get(0)}
 */
private float computeDuration(final List<ZonedDateTime> indeparture, final List<ZonedDateTime> outdeparture) {
    final boolean noReturnLeg = indeparture.isEmpty();
    if (noReturnLeg) {
        return 0f;
    }
    final ZonedDateTime tripStart = outdeparture.get(0);
    final ZonedDateTime returnStart = indeparture.get(0);
    // Same computation as ChronoUnit.DAYS.between(tripStart, returnStart).
    final long wholeDays = tripStart.until(returnStart, ChronoUnit.DAYS);
    return (float) wholeDays;
}
/**
 * Pre-booking feature: complete days from {@code now} until the first
 * outbound departure. The result is negative when that departure lies
 * before {@code now}.
 *
 * @param outdeparture departures of the outbound segments; the first element
 *                     is read unconditionally, so the list must not be empty
 * @param now          the instant treated as "current"
 * @return whole days between {@code now} and {@code outdeparture.get(0)}
 */
private float computePrebooking(final List<ZonedDateTime> outdeparture, final ZonedDateTime now) {
    final ZonedDateTime firstDeparture = outdeparture.get(0);
    // Same computation as ChronoUnit.DAYS.between(now, firstDeparture).
    final long daysAhead = now.until(firstDeparture, ChronoUnit.DAYS);
    return (float) daysAhead;
}
private Map<String, Map<String, Integer>> loadLabels(InputStream labels) throws IOException { private Map<String, Map<String, Integer>> loadLabels(InputStream labels) throws IOException {
final TypeReference<Map<String, Map<String, Integer>>> typeRef = new TypeReference<Map<String, Map<String, Integer>>>() { final TypeReference<Map<String, Map<String, Integer>>> typeRef = new TypeReference<Map<String, Map<String, Integer>>>() {
}; };
return mapper.readValue(labels, typeRef); return mapper.readValue(labels, typeRef);
} }
/**
 * Renders each timestamp with {@code DateTimeFormatter.ISO_LOCAL_DATE_TIME}
 * (zone and offset are not printed) and concatenates the results with
 * {@code "|"} as separator.
 *
 * @param data timestamps to render, in output order
 * @return the joined string; empty when {@code data} is empty
 */
private String joinTimestampList(final List<ZonedDateTime> data) {
    final StringBuilder joined = new StringBuilder();
    boolean first = true;
    for (final ZonedDateTime stamp : data) {
        if (!first) {
            joined.append("|");
        }
        joined.append(DateTimeFormatter.ISO_LOCAL_DATE_TIME.format(stamp));
        first = false;
    }
    return joined.toString();
}
private String joinList(final List<String> data) { private String joinList(final List<String> data) {
return String.join("|", data); return String.join("|", data);
} }
/**
 * Returns the index of the largest element of {@code data}.
 * Ties resolve to the lowest index; an empty array yields 0.
 *
 * @param data values to scan
 * @return index of the first maximal element, or 0 if {@code data} is empty
 */
private int argmax(final float[] data) {
    int idx = 0;
    // Float.MIN_VALUE is the smallest POSITIVE float, so seeding with it made
    // every non-positive input compare as "not larger" and always return 0.
    // NEGATIVE_INFINITY is the true lower bound for the comparison.
    float max = Float.NEGATIVE_INFINITY;
    for (int i = 0; i < data.length; i++) {
        if (data[i] > max) {
            max = data[i];
            idx = i;
        }
    }
    return idx;
}
} }

@ -1,52 +1,48 @@
package cz.aprar.bonitoo.inference; package cz.aprar.bonitoo.inference;
import java.time.ZonedDateTime;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
public class FlightData { public class FlightData {
private final String clientChannel; private final String clientChannel;
private final String type; private final String type;
private final List<String> inboundDeparture; private final List<ZonedDateTime> inboundDeparture;
private final List<String> inboundArrival; private final List<ZonedDateTime> inboundArrival;
private final List<String> inboundOrigin; private final List<String> inboundOrigin;
private final List<String> inboundDestination; private final List<String> inboundDestination;
private final List<String> outboundDeparture; private final List<String> inboundAirlines;
private final List<String> outboundArrival; private final List<ZonedDateTime> outboundDeparture;
private final List<ZonedDateTime> outboundArrival;
private final List<String> outboundOrigin; private final List<String> outboundOrigin;
private final List<String> outboundDestination; private final List<String> outboundDestination;
private final List<String> outboundAirlines;
private final Double inputPrice; private final Double inputPrice;
private final Double inputTax; private final Boolean success;
private final String inputCurrency;
private final Integer status;
private final Double outputPrice; private final Double outputPrice;
private final Double outputTax;
private final String outputCurrency; public FlightData(final String clientChannel, final String type, final List<ZonedDateTime> inboundDeparture,
private final Integer duration; final List<ZonedDateTime> inboundArrival, final List<String> inboundOrigin,
final List<String> inboundDestination, final List<String> inboundAirlines,
public FlightData(final String clientChannel, final String type, final List<ZonedDateTime> outboundDeparture, final List<ZonedDateTime> outboundArrival,
final List<String> inboundDeparture, final List<String> inboundArrival, final List<String> inboundOrigin, final List<String> outboundOrigin, final List<String> outboundDestination,
final List<String> inboundDestination, final List<String> outboundDeparture, final List<String> outboundArrival, final List<String> outboundAirlines, final Double inputPrice, final Boolean success,
final List<String> outboundOrigin, final List<String> outboundDestination, final Double inputPrice, final Double outputPrice) {
final Double inputTax, final String inputCurrency, final Integer status, final Double outputPrice,
final Double outputTax, final String outputCurrency, final Integer duration) {
this.clientChannel = clientChannel; this.clientChannel = clientChannel;
this.type = type; this.type = type;
this.inboundDeparture = inboundDeparture; this.inboundDeparture = inboundDeparture;
this.inboundArrival = inboundArrival; this.inboundArrival = inboundArrival;
this.inboundOrigin = inboundOrigin; this.inboundOrigin = inboundOrigin;
this.inboundDestination = inboundDestination; this.inboundDestination = inboundDestination;
this.inboundAirlines = inboundAirlines;
this.outboundDeparture = outboundDeparture; this.outboundDeparture = outboundDeparture;
this.outboundArrival = outboundArrival; this.outboundArrival = outboundArrival;
this.outboundOrigin = outboundOrigin; this.outboundOrigin = outboundOrigin;
this.outboundDestination = outboundDestination; this.outboundDestination = outboundDestination;
this.outboundAirlines = outboundAirlines;
this.inputPrice = inputPrice; this.inputPrice = inputPrice;
this.inputTax = inputTax; this.success = success;
this.inputCurrency = inputCurrency;
this.status = status;
this.outputPrice = outputPrice; this.outputPrice = outputPrice;
this.outputTax = outputTax;
this.outputCurrency = outputCurrency;
this.duration = duration;
} }
public String getClientChannel() { public String getClientChannel() {
@ -57,11 +53,11 @@ public class FlightData {
return type; return type;
} }
public List<String> getInboundDeparture() { public List<ZonedDateTime> getInboundDeparture() {
return Collections.unmodifiableList(inboundDeparture); return Collections.unmodifiableList(inboundDeparture);
} }
public List<String> getInboundArrival() { public List<ZonedDateTime> getInboundArrival() {
return Collections.unmodifiableList(inboundArrival); return Collections.unmodifiableList(inboundArrival);
} }
@ -73,11 +69,11 @@ public class FlightData {
return Collections.unmodifiableList(inboundDestination); return Collections.unmodifiableList(inboundDestination);
} }
public List<String> getOutboundDeparture() { public List<ZonedDateTime> getOutboundDeparture() {
return Collections.unmodifiableList(outboundDeparture); return Collections.unmodifiableList(outboundDeparture);
} }
public List<String> getOutboundArrival() { public List<ZonedDateTime> getOutboundArrival() {
return Collections.unmodifiableList(outboundArrival); return Collections.unmodifiableList(outboundArrival);
} }
@ -93,31 +89,19 @@ public class FlightData {
return inputPrice; return inputPrice;
} }
public Double getInputTax() {
return inputTax;
}
public String getInputCurrency() {
return inputCurrency;
}
public Integer getStatus() {
return status;
}
public Double getOutputPrice() { public Double getOutputPrice() {
return outputPrice; return outputPrice;
} }
public Double getOutputTax() { public List<String> getInboundAirlines() {
return outputTax; return Collections.unmodifiableList(inboundAirlines);
} }
public String getOutputCurrency() { public List<String> getOutboundAirlines() {
return outputCurrency; return Collections.unmodifiableList(outboundAirlines);
} }
public Integer getDuration() { public Boolean getSuccess() {
return duration; return success;
} }
} }

@ -6,8 +6,11 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.io.IOException; import java.io.IOException;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.stream.Collectors;
import static java.util.Collections.emptyList; import static java.util.Collections.emptyList;
import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertEquals;
@ -34,23 +37,20 @@ public class CacheInferenceTest {
{new FlightData( {new FlightData(
"fly-me-to", "fly-me-to",
"WS", "WS",
toList("2020-05-09T23:59:00", "2020-05-10T10:30:00"), toTimestampList("2020-05-09T23:59:00", "2020-05-10T10:30:00"),
toList("2020-05-10T08:55:00", "2020-05-10T11:30:00"), toTimestampList("2020-05-10T08:55:00", "2020-05-10T11:30:00"),
toList("MCO", "FRA"), toList("MCO", "FRA"),
toList("FRA", "PRG"), toList("FRA", "PRG"),
toList("2020-05-01T09:50:00", "2020-05-01T11:55:00"), toList("LH", "LH"),
toList("2020-05-01T11:00:00", "2020-05-01T21:55:00"), toTimestampList("2020-05-01T09:50:00", "2020-05-01T11:55:00"),
toTimestampList("2020-05-01T11:00:00", "2020-05-01T21:55:00"),
toList("PRG", "FRA"), toList("PRG", "FRA"),
toList("FRA", "MCO"), toList("FRA", "MCO"),
toList("LH", "LH"),
39766.0, 39766.0,
19776.0, Boolean.TRUE,
"CZK", 39766.0
0, ), TTL.D2},
39766.0,
19776.0,
"CZK",
427
), TTL.D1},
{new FlightData( {new FlightData(
"fly-me-to", "fly-me-to",
"PYTON", "PYTON",
@ -58,78 +58,66 @@ public class CacheInferenceTest {
emptyList(), emptyList(),
emptyList(), emptyList(),
emptyList(), emptyList(),
toList("2019-12-18T05:45:00"), emptyList(),
toList("2019-12-18T08:05:00"), toTimestampList("2019-12-18T05:45:00"),
toTimestampList("2019-12-18T08:05:00"),
toList("KRK"), toList("KRK"),
toList("BVA"), toList("BVA"),
toList("FR"),
336.258, 336.258,
0.0, Boolean.TRUE,
"CZK", 336.258
0, ), TTL.D2},
336.258,
0.0,
"CZK",
2284
), TTL.D7},
{new FlightData( {new FlightData(
"levne", "levne",
"AVIA", "AVIA",
toList("2020-02-07T02:25:00", "2020-02-07T14:50:00"), toTimestampList("2020-02-07T02:25:00", "2020-02-07T14:50:00"),
toList("2020-02-07T13:10:00", "2020-02-07T16:55:00"), toTimestampList("2020-02-07T13:10:00", "2020-02-07T16:55:00"),
toList("LAX", "LHR"), toList("LAX", "LHR"),
toList("LHR", "PRG"), toList("LHR", "PRG"),
toList("2020-01-28T10:35:00", "2020-01-28T14:40:00"), toList("AA", "BA"),
toList("2020-01-28T12:45:00", "2020-01-29T01:45:00"), toTimestampList("2020-01-28T10:35:00", "2020-01-28T14:40:00"),
toTimestampList("2020-01-28T12:45:00", "2020-01-29T01:45:00"),
toList("PRG", "HEL"), toList("PRG", "HEL"),
toList("HEL", "LAX"), toList("HEL", "LAX"),
toList("AY", "AY"),
5971.77978, 5971.77978,
0.0, Boolean.TRUE,
"CZK", 15971.77978
0, ), TTL.D2},
15971.77978,
0.0,
"CZK",
551
), TTL.D7},
{new FlightData( {new FlightData(
"fly-me-to", "fly-me-to",
"HH", "HH",
toList("2019-11-01T16:30:00", "2019-11-01T23:35:00"), toTimestampList("2019-11-01T16:30:00", "2019-11-01T23:35:00"),
toList("2019-11-01T21:12:00", "2019-11-02T07:45:00"), toTimestampList("2019-11-01T21:12:00", "2019-11-02T07:45:00"),
toList("YVR", "YUL"), toList("YVR", "YUL"),
toList("YUL", "VIE"), toList("YUL", "VIE"),
toList("2019-10-18T08:10:00", "2019-10-18T11:30:00"), toList("LH", "LH"),
toList("2019-10-18T09:40:00", "2019-10-18T21:25:00"), toTimestampList("2019-10-18T08:10:00", "2019-10-18T11:30:00"),
toTimestampList("2019-10-18T09:40:00", "2019-10-18T21:25:00"),
toList("VIE", "FRA"), toList("VIE", "FRA"),
toList("FRA", "YVR"), toList("FRA", "YVR"),
toList("LH", "LH"),
17723.0, 17723.0,
7708.0, Boolean.TRUE,
"CZK", 17723.0
0, ), TTL.D14},
17723.0,
7708.0,
"CZK",
1786
), TTL.D1},
{new FlightData( {new FlightData(
"unknown", "unknown",
"unknown", "unknown",
toTimestampList(ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)),
toTimestampList(ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)),
toList("unknown"), toList("unknown"),
toList("unknown"), toList("unknown"),
toList("unknown"), toList("unknown"),
toTimestampList(ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)),
toTimestampList(ZonedDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)),
toList("unknown"), toList("unknown"),
toList("unknown"), toList("unknown"),
toList("unknown"), toList("unknown"),
toList("unknown"),
toList("unknown"),
0.0,
0.0,
"unknown",
0,
0.0, 0.0,
0.0, Boolean.FALSE,
"unknown", 0.0
0
), TTL.NOCACHE} ), TTL.NOCACHE}
}; };
} }
@ -137,4 +125,10 @@ public class CacheInferenceTest {
private List<String> toList(final String... data) { private List<String> toList(final String... data) {
return Arrays.asList(data); return Arrays.asList(data);
} }
/**
 * Converts ISO local date-time strings (no zone suffix) into zoned
 * timestamps by appending the "Z" designator before parsing.
 *
 * @param data date-time strings such as "2020-05-09T23:59:00"
 * @return the parsed timestamps, in the same order as the input
 */
private List<ZonedDateTime> toTimestampList(final String... data) {
    final DateTimeFormatter zonedFormat = DateTimeFormatter.ISO_ZONED_DATE_TIME;
    return Arrays.stream(data)
            .map(text -> ZonedDateTime.parse(text + "Z", zonedFormat))
            .collect(Collectors.toList());
}
} }

File diff suppressed because one or more lines are too long

@ -9,7 +9,7 @@ boto_session = boto3.Session(profile_name='bonitoo', region_name='eu-central-1')
sagemaker_session = sagemaker.Session(boto_session=boto_session) sagemaker_session = sagemaker.Session(boto_session=boto_session)
role = 'Bonitoo_SageMaker_Execution' role = 'Bonitoo_SageMaker_Execution'
train_input = 's3://customers-bonitoo-cachettl/sagemaker/data/export-reduced.csv' train_input = 's3://customers-bonitoo-cachettl/sagemaker/data/export.csv'
tf = XGBoost( tf = XGBoost(
entry_point='train_model.py', entry_point='train_model.py',
@ -21,12 +21,15 @@ tf = XGBoost(
framework_version='0.90-1', framework_version='0.90-1',
py_version='py3', py_version='py3',
hyperparameters={ hyperparameters={
'bonitoo_price_limit': 1000, 'bonitoo_price_pos_abs': 1000,
'num_round': 15, 'bonitoo_price_neg_abs': 200,
'bonitoo_price_pos_perc': 0.05,
'bonitoo_price_neg_perc': 0.05,
'num_round': 20,
'max_depth': 15, 'max_depth': 15,
'eta': 0.5, 'eta': 0.5,
'num_class': 8, 'num_class': 8,
'objective': 'multi:softmax', 'objective': 'multi:softprob',
'eval_metric': 'mlogloss' 'eval_metric': 'mlogloss'
}) })

@ -5,6 +5,7 @@ import argparse
import pandas as pd import pandas as pd
import xgboost as xgb import xgboost as xgb
import numpy as np import numpy as np
import datetime as dt
from sagemaker_algorithm_toolkit import exceptions as exc from sagemaker_algorithm_toolkit import exceptions as exc
from sagemaker_xgboost_container.constants import sm_env_constants from sagemaker_xgboost_container.constants import sm_env_constants
@ -25,19 +26,17 @@ columns = [
'flight.inboundSegments.arrival', 'flight.inboundSegments.arrival',
'flight.inboundSegments.origin.airportCode', 'flight.inboundSegments.origin.airportCode',
'flight.inboundSegments.destination.airportCode', 'flight.inboundSegments.destination.airportCode',
'flight.inboundSegments.airline.code',
'flight.outboundSegments.departure', 'flight.outboundSegments.departure',
'flight.outboundSegments.arrival', 'flight.outboundSegments.arrival',
'flight.outboundSegments.origin.airportCode', 'flight.outboundSegments.origin.airportCode',
'flight.outboundSegments.destination.airportCode', 'flight.outboundSegments.destination.airportCode',
'flight.outboundSegments.airline.code',
'input.price', 'input.price',
'input.tax',
'input.currency',
'success', 'success',
'status',
'output.price', 'output.price',
'output.tax', 'cacheAt',
'output.currency', 'cacheExp'
'duration'
] ]
catcolumns = [ catcolumns = [
@ -47,95 +46,112 @@ catcolumns = [
'flight.inboundSegments.arrival', 'flight.inboundSegments.arrival',
'flight.inboundSegments.origin.airportCode', 'flight.inboundSegments.origin.airportCode',
'flight.inboundSegments.destination.airportCode', 'flight.inboundSegments.destination.airportCode',
'flight.inboundSegments.airline.code',
'flight.outboundSegments.departure', 'flight.outboundSegments.departure',
'flight.outboundSegments.arrival', 'flight.outboundSegments.arrival',
'flight.outboundSegments.origin.airportCode', 'flight.outboundSegments.origin.airportCode',
'flight.outboundSegments.destination.airportCode', 'flight.outboundSegments.destination.airportCode',
'input.currency', 'flight.outboundSegments.airline.code'
'output.currency'
] ]
floatcolumns = [ floatcolumns = [
'input.price', 'input.price',
'input.tax', 'output.price'
'output.price',
'output.tax'
] ]
intcolumns = [ timestampcolumns = [
'status', 'timestamp',
'duration' 'cacheAt',
'cacheExp'
] ]
pkcolumns = [ def excessive_price(inprice, outprice, price_pos_abs, price_neg_abs, price_pos_perc, price_neg_perc):
'flight.inboundSegments.departure', return outprice - inprice > price_pos_abs or \
'flight.inboundSegments.origin.airportCode', inprice - outprice > price_neg_abs or \
'flight.inboundSegments.destination.airportCode', outprice > inprice * (1.0 + price_pos_perc) or \
'flight.outboundSegments.departure', outprice < inprice * (1.0 - price_neg_perc)
'flight.outboundSegments.origin.airportCode',
'flight.outboundSegments.destination.airportCode' def equal_price(inprice, outprice):
] return abs(inprice - outprice) < 10
def expected_value(row, price_pos_abs=200, price_neg_abs=100, price_pos_perc=0.05, price_neg_perc=0.05):
# do not cache errors
success = row['success']
if success == 0:
return 0
def expected_value(row, cachetime_df, price_limit=1000):
# TODO sum tax + price ?
inprice, outprice = row['input.price'], row['output.price'] inprice, outprice = row['input.price'], row['output.price']
pricestatus = abs(inprice - outprice) tstamp, cacheAt, cacheExp = row['timestamp'], row['cacheAt'], row['cacheExp']
# TODO correct in cache time ? if cacheAt:
timestamps = cachetime_df.loc[row[pkcolumns].fillna(''),'timestamp'] incachetime = tstamp - cacheAt
tdiff = timestamps['max'] - timestamps['min'] expcachetime = cacheExp - cacheAt
if tdiff.shape[0] > 0:
incachetime = tdiff[0]
else: else:
incachetime = np.timedelta64('NaT') incachetime = np.timedelta64('NaT')
modifier = 0 modifier = 0
if pricestatus > price_limit: if excessive_price(inprice, outprice, price_pos_abs, price_neg_abs, price_pos_perc, price_neg_perc):
modifier = -1 modifier = -1
if pricestatus < 0.1: expcachetime = incachetime
if not incachetime: if equal_price(inprice, outprice):
if pd.isnull(incachetime):
return 3 return 3
modifier = 1 modifier = 1
if pd.isnull(incachetime): if pd.isnull(incachetime):
return 0 return 1
if incachetime <= np.timedelta64(12,'h'): if expcachetime <= np.timedelta64(12,'h'):
return 1 + modifier return 1 + modifier
if incachetime <= np.timedelta64(1,'D'): if expcachetime <= np.timedelta64(1,'D'):
return 2 + modifier return 2 + modifier
if incachetime <= np.timedelta64(2,'D'): if expcachetime <= np.timedelta64(2,'D'):
return 3 + modifier return 3 + modifier
if incachetime <= np.timedelta64(3,'D'): if expcachetime <= np.timedelta64(3,'D'):
return 4 + modifier return 4 + modifier
if incachetime <= np.timedelta64(7,'D'): if expcachetime <= np.timedelta64(7,'D'):
return 5 + modifier return 5 + modifier
if incachetime <= np.timedelta64(14,'D'): if expcachetime <= np.timedelta64(14,'D'):
return 6 + modifier return 6 + modifier
return min(7, 7 + modifier) return min(7, 7 + modifier)
def compute_cached_time(df): def compute_duration(row):
logging.info('Computing cached times') indeparture, outdeparture = row['flight.inboundSegments.departure'], row['flight.outboundSegments.departure']
return df.set_index(pkcolumns).groupby(pkcolumns).agg({'timestamp': ['min', 'max']}) if pd.isna(indeparture):
return 0
else:
indt = dt.datetime.fromisoformat(indeparture.split('|')[0])
outdt = dt.datetime.fromisoformat(outdeparture.split('|')[0])
return (indt - outdt).days
def compute_prebooking(row):
    """Return the whole days from the row's timestamp to the first outbound departure.

    Args:
        row: mapping-like record providing 'timestamp' (a datetime-compatible
            value) and 'flight.outboundSegments.departure' (ISO date-times
            joined with '|').

    Returns:
        The ``days`` component of (first outbound departure - timestamp),
        or 0 when the outbound departure is missing.
    """
    tstamp = row['timestamp']
    outdeparture = row['flight.outboundSegments.departure']
    # pandas reads an empty CSV cell as NaN (a float), which has no .split();
    # mirror compute_duration and report 0 days instead of crashing.
    if pd.isna(outdeparture):
        return 0
    # Segment values are '|'-joined; the first entry is the first leg.
    outdt = dt.datetime.fromisoformat(outdeparture.split('|')[0])
    return (outdt - tstamp).days
def preprocess_data(df): def preprocess_data(df):
logging.info('Preprocessing start') logging.info('Preprocessing start')
df = df[df.loc[:, 'success'] == True]
df.loc[:, 'timestamp'] = df.loc[:, 'timestamp'].apply(lambda x: pd.to_datetime(x)) booleanDictionary = {True: 1, False: 0}
df.loc[:, 'success'] = df.loc[:, 'success'].replace(booleanDictionary)
for ct in timestampcolumns:
df.loc[:, ct] = df.loc[:, ct].apply(lambda x: pd.to_datetime(x))
for cc in catcolumns: for cc in catcolumns:
df.loc[:, cc] = df.loc[:, cc].astype('category') df.loc[:, cc] = df.loc[:, cc].astype('category')
df.loc[:, '%s_codes' % cc] = df[cc].cat.codes df.loc[:, '%s_codes' % cc] = df[cc].cat.codes
df.loc[:, floatcolumns] = df.loc[:, floatcolumns].astype('float64') df.loc[:, floatcolumns] = df.loc[:, floatcolumns].astype('float64')
df.loc[:, intcolumns] = df.loc[:, intcolumns].fillna(-1).astype('int32')
df.loc[:, 'duration'] = df.apply(lambda x: compute_duration(x), axis=1)
df.loc[:, 'prebooking'] = df.apply(lambda x: compute_prebooking(x), axis=1)
return df return df
def remove_non_features(df): def remove_non_features(df):
return df.drop(['timestamp', 'success'] + catcolumns, axis=1), df return df.drop(catcolumns + timestampcolumns, axis=1), df
def train_test_split(df, label, ratio): def train_test_split(df, label, ratio):
logging.info('Splitting dataset with ration %f', ratio) logging.info('Splitting dataset with ration %f', ratio)
@ -162,7 +178,6 @@ def get_csv_pandas(files_path):
df = pd.read_csv(os.path.join(files_path, csv_file), header=None) df = pd.read_csv(os.path.join(files_path, csv_file), header=None)
df.columns = columns df.columns = columns
#raise "cols: " + df.info()
return df return df
except Exception as e: except Exception as e:
@ -183,6 +198,7 @@ def get_pandas_df(data_path):
return df return df
def get_df(train_path, validate_path, content_type='text/csv'): def get_df(train_path, validate_path, content_type='text/csv'):
train_files_size = get_size(train_path) if train_path else 0 train_files_size = get_size(train_path) if train_path else 0
val_files_size = get_size(validate_path) if validate_path else 0 val_files_size = get_size(validate_path) if validate_path else 0
@ -217,7 +233,7 @@ def save_encoders(encoder_location, df):
jsondata = {} jsondata = {}
for cc in catcolumns: for cc in catcolumns:
jsondata[cc] = {cat: idx for idx, cat in enumerate(df[cc].cat.categories)} jsondata[cc] = {cat: idx for idx, cat in enumerate(df[cc].cat.categories)}
with open(encoder_location, 'w') as f: with open(encoder_location, 'w') as f:
json.dump(jsondata, f) json.dump(jsondata, f)
@ -225,14 +241,18 @@ def sagemaker_train(train_config, data_config, train_path, val_path, model_dir,
checkpoint_config): checkpoint_config):
metrics = metrics_mod.initialize() metrics = metrics_mod.initialize()
hyperparameters = hpv.initialize(metrics) hyperparameters = hpv.initialize(metrics)
price_limit = int(train_config.get('bonitoo_price_limit', 1000)) price_pos_abs = int(train_config.get('bonitoo_price_pos_abs', 200))
price_neg_abs = int(train_config.get('bonitoo_price_neg_abs', 200))
price_pos_perc = float(train_config.get('bonitoo_price_pos_perc', 0.05))
price_neg_perc = float(train_config.get('bonitoo_price_neg_perc', 0.05))
train_config = {k:v.replace('"', '') for k,v in train_config.items() if not k.startswith('sagemaker_') and not k.startswith('bonitoo_')} train_config = {k:v.replace('"', '') for k,v in train_config.items() if not k.startswith('sagemaker_') and not k.startswith('bonitoo_')}
train_config = hyperparameters.validate(train_config) train_config = hyperparameters.validate(train_config)
if train_config.get("updater"): if train_config.get("updater"):
train_config["updater"] = ",".join(train_config["updater"]) train_config["updater"] = ",".join(train_config["updater"])
logging.info("hyperparameters {}".format(train_config)) logging.info("hyperparameters {}".format(train_config))
logging.info("channels {}".format(data_config)) logging.info("channels {}".format(data_config))
@ -243,14 +263,13 @@ def sagemaker_train(train_config, data_config, train_path, val_path, model_dir,
train_df, val_df = get_df(train_path, val_path) train_df, val_df = get_df(train_path, val_path)
train_df = preprocess_data(train_df) train_df = preprocess_data(train_df)
cachetime_df = compute_cached_time(train_df) train_label_df = train_df.apply(lambda x: expected_value(x, price_pos_abs, price_neg_abs, price_pos_perc, price_neg_perc), axis=1).to_frame(name='label')
train_label_df = train_df.apply(lambda x: expected_value(x, cachetime_df, price_limit), axis=1).to_frame(name='label')
train_df, train_df_orig = remove_non_features(train_df) train_df, train_df_orig = remove_non_features(train_df)
val_label_df = None val_label_df = None
if val_df: if val_df:
val_df = preprocess_data(val_df) val_df = preprocess_data(val_df)
val_label_df = val_df.apply(lambda x: expected_value(x, cachetime_df, price_limit), axis=1).to_frame(name='label') val_label_df = val_df.apply(lambda x: expected_value(x, cachetime_df, price_pos_abs, price_neg_abs, price_pos_perc, price_neg_perc), axis=1).to_frame(name='label')
val_df, val_df_orig = remove_non_features(val_df) val_df, val_df_orig = remove_non_features(val_df)
train_dmatrix, val_dmatrix = get_dmatrices(train_df, train_label_df, val_df, val_label_df) train_dmatrix, val_dmatrix = get_dmatrices(train_df, train_label_df, val_df, val_label_df)
@ -288,6 +307,7 @@ def sagemaker_train(train_config, data_config, train_path, val_path, model_dir,
else: else:
raise exc.PlatformError("Number of hosts should be an int greater than or equal to 1") raise exc.PlatformError("Number of hosts should be an int greater than or equal to 1")
def train_job(train_cfg, train_dmatrix, val_dmatrix, train_df, model_dir, checkpoint_dir, is_master): def train_job(train_cfg, train_dmatrix, val_dmatrix, train_df, model_dir, checkpoint_dir, is_master):
# Parse arguments for train() API # Parse arguments for train() API
early_stopping_rounds = train_cfg.get('early_stopping_rounds') early_stopping_rounds = train_cfg.get('early_stopping_rounds')

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save