Full set of fields

master
EHP 6 years ago
commit 76645f273d
  1. 115
      export.py
  2. 303
      external.ipynb
  3. 2
      src/requirements.txt
  4. 420
      src/train_model.py
  5. 155
      xgboost load.ipynb

@ -0,0 +1,115 @@
from pymongo import MongoClient
from pprint import pprint
from datetime import datetime
import csv
# Connect to a MongoDB server on the default localhost:27017 and select the
# `bonitoo` database; the export loop below reads its pricing_audit collection.
client = MongoClient()
db=client.bonitoo
# CSV column names for the export: nested Mongo document paths are dot-joined.
# The nine per-segment attributes are identical for the inbound and outbound
# legs, so they are generated from a single tuple instead of being listed twice.
_SEGMENT_ATTRIBUTES = (
    'departure',
    'arrival',
    'origin.airportCode',
    'destination.airportCode',
    'flightNumber',
    'travelClass',
    'bookingCode',
    'availability',
    'elapsedFlyingTime',
)
fieldnames = (
    ['timestamp', 'client.channel', 'type']
    + ['flight.inboundSegments.' + attr for attr in _SEGMENT_ATTRIBUTES]
    + ['flight.outboundSegments.' + attr for attr in _SEGMENT_ATTRIBUTES]
    + [
        'flight.inboundEFT',  # elapsed flying time
        'flight.outboundEFT',
        'oneWay',
        'adults',  # number of persons = (adults + children)
        'children',
        'infants',
        'input.price',
        'input.tax',
        'input.currency',
        'success',
        'status',
        'output.price',
        'output.tax',
        'output.currency',
        'duration',  # duration of the call to the upstream system
    ]
)
# 5% nebo 200 kc rozdil nahoru
# -200 kc dolu
# abs(+-10kc) ignorovat
# timestamp + ok price - ma byt v cache od cacheat
# timestamp + notok price - nema byt v cache od cacheat
# delka pobytu prilet-odlet
# delka letu ?
# pokud je chyba tak nocache (= chybi priceout)
# brat v uvahu in/out kody aerolinek (mcx ?) - mirek jeste zjisti
# vypocitat uspesnost je/neni v cache v %
counter = 0
with open('export.csv', mode='w') as ef:
writer = csv.DictWriter(ef, fieldnames=fieldnames, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
# do not write header for s3 files
# writer.writeheader()
for it in db.pricing_audit.find():
counter += 1
if counter % 1000 == 0:
print('Iterace %d' % counter)
d = {
'timestamp': datetime.fromtimestamp(it['timestamp']/1000).isoformat(),
'client.channel': it['client']['channel'],
'type': it['type'],
'flight.outboundSegments.departure': '|'.join([x['departure'].isoformat() for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.arrival': '|'.join([x['arrival'].isoformat() for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.origin.airportCode': '|'.join([x['origin']['airportCode'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.destination.airportCode': '|'.join([x['destination']['airportCode'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.flightNumber': '|'.join([x['flightNumber'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.travelClass': '|'.join([x['travelClass'] for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.bookingCode': '|'.join([x.get('bookingCode','') for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.availability': '|'.join([str(x.get('availability','')) for x in it['flight']['outboundSegments']]),
'flight.outboundSegments.elapsedFlyingTime': '|'.join([str(x.get('elapsedFlyingTime','')) for x in it['flight']['outboundSegments']]),
'flight.inboundEFT': it['flight'].get('inboundEFT',''),
'flight.outboundEFT': it['flight'].get('outboundEFT',''),
'oneWay': it['oneWay'],
'adults': it['adults'],
'children': it['children'],
'infants': it['infants'],
'input.price': it['input']['price'],
'input.tax': it['input']['tax'],
'input.currency': it['input']['currency'],
'success': it['success'],
'status': it.get('status',''),
'output.price': it.get('output', {'price': 0})['price'],
'output.tax': it.get('output', {'tax': 0})['tax'],
'output.currency': it.get('output', {'currency': 0})['currency'],
'duration': it['duration']
}
if 'inboundSegments' in it['flight']:
inb = {
'flight.inboundSegments.departure': '|'.join([x['departure'].isoformat() for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.arrival': '|'.join([x['arrival'].isoformat() for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.origin.airportCode': '|'.join([x['origin']['airportCode'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.destination.airportCode': '|'.join([x['destination']['airportCode'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.flightNumber': '|'.join([x['flightNumber'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.travelClass': '|'.join([x['travelClass'] for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.bookingCode': '|'.join([x.get('bookingCode', '') for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.availability': '|'.join([str(x.get('availability','')) for x in it['flight']['inboundSegments']]),
'flight.inboundSegments.elapsedFlyingTime': '|'.join([str(x.get('elapsedFlyingTime','')) for x in it['flight']['inboundSegments']])
}
d = {**d, **inb}
writer.writerow(d)

@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sagemaker\n",
"import boto3\n",
"from sagemaker import get_execution_role\n",
"\n",
"boto_session = boto3.Session(profile_name='bonitoo', region_name='eu-central-1')\n",
"sagemaker_session = sagemaker.LocalSession(boto_session=boto_session)\n",
"#sagemaker_session = sagemaker.Session()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Get a SageMaker-compatible role used by this Notebook Instance.\n",
"#role = get_execution_role()\n",
"role = 'Bonitoo_SageMaker_Execution'\n",
"region = boto_session.region_name"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_input = 's3://customers-bonitoo-cachettl/sagemaker/data/export.csv'"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"from sagemaker.xgboost.estimator import XGBoost\n",
"\n",
"tf = XGBoost(\n",
" entry_point='train_model.py',\n",
" source_dir='./src',\n",
" train_instance_type='local',\n",
" train_instance_count=1,\n",
" role=role,\n",
" sagemaker_session=sagemaker_session,\n",
" framework_version='0.90-1',\n",
" py_version='py3',\n",
" hyperparameters={\n",
" 'bonitoo_price_limit': 1000,\n",
" 'num_round': 15,\n",
" 'max_depth': 15,\n",
" 'eta': 0.5,\n",
" 'num_class': 8,\n",
" 'objective': 'multi:softmax',\n",
" 'eval_metric': 'mlogloss'\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating tmptao5hpuc_algo-1-x6dhm_1 ... \n",
"\u001b[1BAttaching to tmptao5hpuc_algo-1-x6dhm_12mdone\u001b[0m\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker_xgboost_container.training:Invoking user training script.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Module train_model does not provide a setup.py. \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Generating setup.py\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Generating setup.cfg\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Generating MANIFEST.in\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Installing module with the following command:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/bin/python3 -m pip install . -r requirements.txt\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Processing /opt/ml/code\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: pandas in /usr/local/lib/python3.5/dist-packages (from -r requirements.txt (line 1)) (0.24.2)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: numpy in /usr/local/lib/python3.5/dist-packages (from -r requirements.txt (line 2)) (1.17.2)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2019.2)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: python-dateutil>=2.5.0 in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2.8.0)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.5/dist-packages (from python-dateutil>=2.5.0->pandas->-r requirements.txt (line 1)) (1.12.0)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Building wheels for collected packages: train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Building wheel for train-model (setup.py) ... \u001b[?25ldone\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \u001b[?25h Created wheel for train-model: filename=train_model-1.0.0-py2.py3-none-any.whl size=6578 sha256=f2f4bac7a2d0260f534e32b3ac0341fb291f30669499adf59ead09aa62b7ccc5\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Stored in directory: /tmp/pip-ephem-wheel-cache-vdfjugbr/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Successfully built train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Installing collected packages: train-model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Successfully installed train-model-1.0.0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:sagemaker-containers:Invoking user script\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Training Env:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"network_interface_name\": \"eth0\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hosts\": [\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"algo-1-x6dhm\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ],\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"log_level\": 20,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_config_dir\": \"/opt/ml/input/config\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"framework_module\": \"sagemaker_xgboost_container.training:main\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_dir\": \"/opt/ml/input\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"channel_input_dirs\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"training\": \"/opt/ml/input/data/training\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_gpus\": 0,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"job_name\": \"sagemaker-xgboost-2019-10-05-20-16-58-398\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"user_entry_point\": \"train_model.py\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"current_host\": \"algo-1-x6dhm\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"module_dir\": \"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"master_hostname\": \"algo-1-x6dhm\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"module_name\": \"train_model\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"resource_config\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"current_host\": \"algo-1-x6dhm\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hosts\": [\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"algo-1-x6dhm\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"additional_framework_parameters\": {},\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_cpus\": 6,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_data_dir\": \"/opt/ml/output/data\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"input_data_config\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"training\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"TrainingInputMode\": \"File\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m }\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"is_master\": true,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"hyperparameters\": {\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"bonitoo_price_limit\": 1000,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"max_depth\": 15,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"objective\": \"multi:softmax\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_class\": 8,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"eta\": 0.5,\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"eval_metric\": \"mlogloss\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"num_round\": 15\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m },\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"output_dir\": \"/opt/ml/output\",\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"model_dir\": \"/opt/ml/model\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m }\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Environment variables:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_CONFIG_DIR=/opt/ml/input/config\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_MAX_DEPTH=15\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_LOG_LEVEL=20\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_DIR=/opt/ml/output\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NUM_CPUS=6\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CHANNELS=[\"training\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_NUM_ROUND=15\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_OBJECTIVE=multi:softmax\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_DATA_DIR=/opt/ml/output/data\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_FRAMEWORK_MODULE=sagemaker_xgboost_container.training:main\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NETWORK_INTERFACE_NAME=eth0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_DATA_CONFIG={\"training\":{\"TrainingInputMode\":\"File\"}}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_FRAMEWORK_PARAMS={}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HPS={\"bonitoo_price_limit\":1000,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":15,\"objective\":\"multi:softmax\"}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m PYTHONPATH=/usr/local/bin:/:/usr/local/lib/python3.5/dist-packages/xgboost/dmlc-core/tracker:/usr/lib/python35.zip:/usr/lib/python3.5:/usr/lib/python3.5/plat-x86_64-linux-gnu:/usr/lib/python3.5/lib-dynload:/usr/local/lib/python3.5/dist-packages:/usr/lib/python3/dist-packages\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_RESOURCE_CONFIG={\"current_host\":\"algo-1-x6dhm\",\"hosts\":[\"algo-1-x6dhm\"]}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_NUM_GPUS=0\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_ETA=0.5\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_NUM_CLASS=8\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODULE_DIR=s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_USER_ARGS=[\"--bonitoo_price_limit\",\"1000\",\"--eta\",\"0.5\",\"--eval_metric\",\"mlogloss\",\"--max_depth\",\"15\",\"--num_class\",\"8\",\"--num_round\",\"15\",\"--objective\",\"multi:softmax\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_USER_ENTRY_POINT=train_model.py\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CURRENT_HOST=algo-1-x6dhm\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_INPUT_DIR=/opt/ml/input\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_CHANNEL_TRAINING=/opt/ml/input/data/training\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_EVAL_METRIC=mlogloss\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODULE_NAME=train_model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HP_BONITOO_PRICE_LIMIT=1000\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_HOSTS=[\"algo-1-x6dhm\"]\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1-x6dhm\",\"framework_module\":\"sagemaker_xgboost_container.training:main\",\"hosts\":[\"algo-1-x6dhm\"],\"hyperparameters\":{\"bonitoo_price_limit\":1000,\"eta\":0.5,\"eval_metric\":\"mlogloss\",\"max_depth\":15,\"num_class\":8,\"num_round\":15,\"objective\":\"multi:softmax\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"training\":{\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"sagemaker-xgboost-2019-10-05-20-16-58-398\",\"log_level\":20,\"master_hostname\":\"algo-1-x6dhm\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-central-1-029917565482/sagemaker-xgboost-2019-10-05-20-16-58-398/source/sourcedir.tar.gz\",\"module_name\":\"train_model\",\"network_interface_name\":\"eth0\",\"num_cpus\":6,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1-x6dhm\",\"hosts\":[\"algo-1-x6dhm\"]},\"user_entry_point\":\"train_model.py\"}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m SM_MODEL_DIR=/opt/ml/model\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Invoking script with the following command:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/bin/python3 -m train_model --bonitoo_price_limit 1000 --eta 0.5 --eval_metric mlogloss --max_depth 15 --num_class 8 --num_round 15 --objective multi:softmax\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m ERROR:sagemaker-containers:ExecuteUserScriptError:\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Command \"/usr/bin/python3 -m train_model --bonitoo_price_limit 1000 --eta 0.5 --eval_metric mlogloss --max_depth 15 --num_class 8 --num_round 15 --objective multi:softmax\"\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:hyperparameters {'num_round': 15, 'num_class': 8, 'objective': 'multi:softmax', 'eta': 0.5, 'max_depth': 15, 'eval_metric': ['mlogloss']}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:channels {'training': {'TrainingInputMode': 'File'}}\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Determined delimiter of CSV input is ','\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Loading csv file export.csv\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Preprocessing start\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/pandas/core/indexing.py:543: SettingWithCopyWarning: \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m A value is trying to be set on a copy of a slice from a DataFrame.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Try using .loc[row_indexer,col_indexer] = value instead\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m self.obj[item] = s\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/pandas/core/indexing.py:362: SettingWithCopyWarning: \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m A value is trying to be set on a copy of a slice from a DataFrame.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Try using .loc[row_indexer,col_indexer] = value instead\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m self.obj[key] = _infer_fill_value(value)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Computing cached times\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Splitting dataset with ration 0.800000\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m if getattr(data, 'base', None) is not None and \\\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m /usr/local/lib/python3.5/dist-packages/xgboost/core.py:588: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m data.base is not None and isinstance(data, np.ndarray) \\\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Single node training.\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Train matrix has 25393 rows\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:Validation matrix has 6314 rows\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m INFO:root:cols: ['index', 'adults', 'children', 'infants', 'input.price', 'input.tax', 'status', 'output.price', 'output.tax', 'duration', 'client.channel_codes', 'type_codes', 'flight.inboundSegments.departure_codes', 'flight.inboundSegments.arrival_codes', 'flight.inboundSegments.origin.airportCode_codes', 'flight.inboundSegments.destination.airportCode_codes', 'flight.inboundSegments.flightNumber_codes', 'flight.inboundSegments.travelClass_codes', 'flight.inboundSegments.bookingCode_codes', 'flight.inboundSegments.availability_codes', 'flight.inboundSegments.elapsedFlyingTime_codes', 'flight.outboundSegments.departure_codes', 'flight.outboundSegments.arrival_codes', 'flight.outboundSegments.origin.airportCode_codes', 'flight.outboundSegments.destination.airportCode_codes', 'flight.outboundSegments.flightNumber_codes', 'flight.outboundSegments.travelClass_codes', 'flight.outboundSegments.bookingCode_codes', 'flight.outboundSegments.availability_codes', 'flight.outboundSegments.elapsedFlyingTime_codes', 'flight.inboundEFT_codes', 'flight.outboundEFT_codes', 'input.currency_codes', 'output.currency_codes', 'oneWay_codes']\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Traceback (most recent call last):\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/usr/lib/python3.5/runpy.py\", line 184, in _run_module_as_main\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m \"__main__\", mod_spec)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/usr/lib/python3.5/runpy.py\", line 85, in _run_code\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m exec(code, run_globals)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 417, in <module>\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m checkpoint_config=checkpoint_config\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 320, in sagemaker_train\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m train_job(**train_args)\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m File \"/opt/ml/code/train_model.py\", line 364, in train_job\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m raise Exception(\"cols: %s\" % str(train_dmatrix.feature_names))\n",
"\u001b[36malgo-1-x6dhm_1 |\u001b[0m Exception: cols: ['index', 'adults', 'children', 'infants', 'input.price', 'input.tax', 'status', 'output.price', 'output.tax', 'duration', 'client.channel_codes', 'type_codes', 'flight.inboundSegments.departure_codes', 'flight.inboundSegments.arrival_codes', 'flight.inboundSegments.origin.airportCode_codes', 'flight.inboundSegments.destination.airportCode_codes', 'flight.inboundSegments.flightNumber_codes', 'flight.inboundSegments.travelClass_codes', 'flight.inboundSegments.bookingCode_codes', 'flight.inboundSegments.availability_codes', 'flight.inboundSegments.elapsedFlyingTime_codes', 'flight.outboundSegments.departure_codes', 'flight.outboundSegments.arrival_codes', 'flight.outboundSegments.origin.airportCode_codes', 'flight.outboundSegments.destination.airportCode_codes', 'flight.outboundSegments.flightNumber_codes', 'flight.outboundSegments.travelClass_codes', 'flight.outboundSegments.bookingCode_codes', 'flight.outboundSegments.availability_codes', 'flight.outboundSegments.elapsedFlyingTime_codes', 'flight.inboundEFT_codes', 'flight.outboundEFT_codes', 'input.currency_codes', 'output.currency_codes', 'oneWay_codes']\n",
"\u001b[36mtmptao5hpuc_algo-1-x6dhm_1 exited with code 1\n",
"\u001b[0mAborting on container exit...\n"
]
},
{
"ename": "RuntimeError",
"evalue": "Failed to run: ['docker-compose', '-f', '/tmp/tmptao5hpuc/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 148\u001b[0;31m \u001b[0m_stream_output\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprocess\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 149\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36m_stream_output\u001b[0;34m(process)\u001b[0m\n\u001b[1;32m 656\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexit_code\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 657\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Process exited with code: %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mexit_code\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 658\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Process exited with code: 1",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-51-ffc1ca8d95fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mestimator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'training'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtrain_input\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;31m#estimator = sklearn.attach('sagemaker-scikit-learn-2019-01-25-16-34-38-829')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, inputs, wait, logs, job_name)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_for_training\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_TrainingJob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart_new\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mstart_new\u001b[0;34m(cls, estimator, inputs)\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_add_spot_checkpoint_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlocal_mode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 863\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mtrain_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 864\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 865\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_current_job_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path)\u001b[0m\n\u001b[1;32m 390\u001b[0m \u001b[0mLOGGER\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Creating training-job with name: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0mLOGGER\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"train request: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_request\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 392\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_training_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mtrain_request\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 393\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 394\u001b[0m def compile_model(\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/local_session.py\u001b[0m in \u001b[0;36mcreate_training_job\u001b[0;34m(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mtraining_job\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_LocalTrainingJob\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0mhyperparameters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"HyperParameters\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"HyperParameters\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 101\u001b[0;31m \u001b[0mtraining_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mInputDataConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mOutputDataConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhyperparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTrainingJobName\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0mLocalSagemakerClient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_training_jobs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mTrainingJobName\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtraining_job\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/entities.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m self.model_artifacts = self.container.train(\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0minput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhyperparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m )\n\u001b[1;32m 91\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/aprar/bonitoo/.venv/lib/python3.7/site-packages/sagemaker/local/image.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, input_data_config, output_data_config, hyperparameters, job_name)\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[0;31m# which contains the exit code and append the command line to it.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Failed to run: %s, %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mcompose_command\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 153\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 154\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[0martifacts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve_artifacts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompose_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_data_config\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: Failed to run: ['docker-compose', '-f', '/tmp/tmptao5hpuc/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1"
]
}
],
"source": [
"estimator = tf.fit({'training': train_input})\n",
"#estimator = sklearn.attach('sagemaker-scikit-learn-2019-01-25-16-34-38-829')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,2 @@
pandas
numpy

@ -0,0 +1,420 @@
import os
import json
import logging
import argparse
import pandas as pd
import xgboost as xgb
import numpy as np
from sagemaker_algorithm_toolkit import exceptions as exc
from sagemaker_xgboost_container.constants import sm_env_constants
from sagemaker_xgboost_container.data_utils import get_content_type, get_dmatrix, get_size, validate_data_file_path
from sagemaker_xgboost_container import distributed
from sagemaker_xgboost_container import checkpointing
from sagemaker_xgboost_container.algorithm_mode import channel_validation as cv
from sagemaker_xgboost_container.algorithm_mode import hyperparameter_validation as hpv
from sagemaker_xgboost_container.algorithm_mode import metrics as metrics_mod
from sagemaker_xgboost_container.algorithm_mode import train_utils
from sagemaker_xgboost_container.constants.xgb_constants import CUSTOMER_ERRORS
# Column names for the header-less CSV export rows (assigned to the frame in
# get_csv_pandas).  Dotted names mirror the source MongoDB document paths of
# the price-check records produced by the companion export script.
columns = [
    'timestamp',
    'client.channel',
    'type',
    'flight.inboundSegments.departure',
    'flight.inboundSegments.arrival',
    'flight.inboundSegments.origin.airportCode',
    'flight.inboundSegments.destination.airportCode',
    'flight.inboundSegments.flightNumber',
    'flight.inboundSegments.travelClass',
    'flight.inboundSegments.bookingCode',
    'flight.inboundSegments.availability',
    'flight.inboundSegments.elapsedFlyingTime',
    'flight.outboundSegments.departure',
    'flight.outboundSegments.arrival',
    'flight.outboundSegments.origin.airportCode',
    'flight.outboundSegments.destination.airportCode',
    'flight.outboundSegments.flightNumber',
    'flight.outboundSegments.travelClass',
    'flight.outboundSegments.bookingCode',
    'flight.outboundSegments.availability',
    'flight.outboundSegments.elapsedFlyingTime',
    'flight.inboundEFT',   # elapsed flying time, inbound leg
    'flight.outboundEFT',  # elapsed flying time, outbound leg
    'oneWay',
    'adults',    # passenger count = adults + children (per the export script)
    'children',
    'infants',
    'input.price',
    'input.tax',
    'input.currency',
    'success',
    'status',
    'output.price',
    'output.tax',
    'output.currency',
    'duration'   # duration of the call to the upstream system
]
# Columns treated as categorical: preprocess_data converts each to pandas
# 'category' dtype and adds a parallel integer '<name>_codes' column that is
# used as the actual model feature.
catcolumns = [
    'client.channel',
    'type',
    'flight.inboundSegments.departure',
    'flight.inboundSegments.arrival',
    'flight.inboundSegments.origin.airportCode',
    'flight.inboundSegments.destination.airportCode',
    'flight.inboundSegments.flightNumber',
    'flight.inboundSegments.travelClass',
    'flight.inboundSegments.bookingCode',
    'flight.inboundSegments.availability',
    'flight.inboundSegments.elapsedFlyingTime',
    'flight.outboundSegments.departure',
    'flight.outboundSegments.arrival',
    'flight.outboundSegments.origin.airportCode',
    'flight.outboundSegments.destination.airportCode',
    'flight.outboundSegments.flightNumber',
    'flight.outboundSegments.travelClass',
    'flight.outboundSegments.bookingCode',
    'flight.outboundSegments.availability',
    'flight.outboundSegments.elapsedFlyingTime',
    'flight.inboundEFT',
    'flight.outboundEFT',
    'input.currency',
    'output.currency',
    'oneWay'
]
# Columns cast to float64 in preprocess_data.
floatcolumns = [
    'input.price',
    'input.tax',
    'output.price',
    'output.tax'
]
# Columns cast to int32 in preprocess_data; missing values are filled with -1.
intcolumns = [
    'adults',
    'children',
    'infants',
    'status',
    'duration'
]
# Key columns identifying one itinerary; rows are grouped by this key to
# measure how long a price stayed unchanged (see compute_cached_time and
# expected_value).
pkcolumns = [
    'flight.inboundSegments.departure',
    'flight.inboundSegments.origin.airportCode',
    'flight.inboundSegments.destination.airportCode',
    'flight.outboundSegments.departure',
    'flight.outboundSegments.origin.airportCode',
    'flight.outboundSegments.destination.airportCode'
]
def expected_value(row, cachetime_df, price_limit=1000):
    """Derive the integer training label for one request row.

    The label buckets how long this itinerary's price was observed unchanged
    (the spread between the first and last record sharing the same pkcolumns
    key): 0 = no usable window ("do not cache"), then increasing buckets at
    12h / 1d / 2d / 3d / 7d / 14d, capped at 7.  The bucket is shifted by
    -1 when input and output prices differ by more than price_limit and by
    +1 when they match exactly (absolute differences under 0.1 count as a
    match).

    row          -- one preprocessed record (pandas Series).
    cachetime_df -- result of compute_cached_time(): per-key min/max
                    timestamps with a MultiIndex over pkcolumns.
    price_limit  -- absolute price difference above which the quote is
                    considered wrong.
    """
    # TODO sum tax + price ?
    inprice, outprice = row['input.price'], row['output.price']
    pricestatus = abs(inprice - outprice)
    # TODO correct in cache time ?
    # Observation window for this itinerary key.
    # NOTE(review): the lookup key is NaN-filled with '' here, but
    # compute_cached_time groups the raw (unfilled) values -- confirm that
    # rows with missing key parts resolve as intended.
    timestamps = cachetime_df.loc[row[pkcolumns].fillna(''), 'timestamp']
    tdiff = timestamps['max'] - timestamps['min']
    if tdiff.shape[0] > 0:
        incachetime = tdiff[0]
    else:
        incachetime = np.timedelta64('NaT')
    modifier = 0
    if pricestatus > price_limit:
        # Quoted price drifted too far from the verified price: demote.
        modifier = -1
    if pricestatus < 0.1:
        # Exact price match.
        # NOTE(review): `not incachetime` relies on the truthiness of a
        # timedelta64 / NaT value, which is numpy-version dependent --
        # confirm the intended branch for NaT and zero-length windows.
        if not incachetime:
            return 3
        modifier = 1
    if pd.isnull(incachetime):
        return 0
    if incachetime <= np.timedelta64(12, 'h'):
        return 1 + modifier
    if incachetime <= np.timedelta64(1, 'D'):
        return 2 + modifier
    if incachetime <= np.timedelta64(2, 'D'):
        return 3 + modifier
    if incachetime <= np.timedelta64(3, 'D'):
        return 4 + modifier
    if incachetime <= np.timedelta64(7, 'D'):
        return 5 + modifier
    if incachetime <= np.timedelta64(14, 'D'):
        return 6 + modifier
    # Top bucket; min() keeps a +1 modifier from exceeding 7.
    return min(7, 7 + modifier)
def compute_cached_time(df):
    """Return the earliest and latest observed timestamp per itinerary key.

    Groups rows by pkcolumns and aggregates 'timestamp' with min and max;
    the result is consumed by expected_value() to measure price stability.
    """
    logging.info('Computing cached times')
    grouped = df.set_index(pkcolumns).groupby(pkcolumns)
    return grouped.agg({'timestamp': ['min', 'max']})
def preprocess_data(df):
    """Clean the raw export frame and derive model-ready columns.

    Keeps only rows where the upstream price check succeeded, parses the
    timestamps, converts categorical columns to pandas 'category' dtype
    (adding an integer '<name>_codes' twin per column), and normalizes the
    float/int columns' dtypes.  Returns the processed DataFrame.
    """
    logging.info('Preprocessing start')
    # Take an explicit copy of the filtered rows so the .loc assignments
    # below do not hit pandas' SettingWithCopy ambiguity on a slice.
    df = df[df.loc[:, 'success'] == True].copy()
    # Vectorized parse instead of a per-row apply; same result, much faster.
    df.loc[:, 'timestamp'] = pd.to_datetime(df.loc[:, 'timestamp'])
    # Encode booleans as strings so 'oneWay' can be handled as a category.
    booleanDictionary = {True: 'TRUE', False: 'FALSE'}
    df.loc[:, 'oneWay'] = df.loc[:, 'oneWay'].replace(booleanDictionary)
    for cc in catcolumns:
        df.loc[:, cc] = df.loc[:, cc].astype('category')
        # Integer code twin: the raw category columns are dropped later in
        # remove_non_features, the codes become the model features.
        df.loc[:, '%s_codes' % cc] = df[cc].cat.codes
    df.loc[:, floatcolumns] = df.loc[:, floatcolumns].astype('float64')
    # -1 marks missing values so the columns can be stored as int32.
    df.loc[:, intcolumns] = df.loc[:, intcolumns].fillna(-1).astype('int32')
    return df
def remove_non_features(df):
    """Split off non-feature columns.

    Returns (features_df, original_df): the first frame has the timestamp,
    success flag and the raw categorical columns removed; the second is the
    untouched input (kept for saving the category encoders later).
    """
    non_features = ['timestamp', 'success'] + catcolumns
    return df.drop(non_features, axis=1), df
def train_test_split(df, label, ratio):
    """Randomly split features and labels into train/test partitions.

    df    -- feature DataFrame.
    label -- label DataFrame aligned row-for-row with df.
    ratio -- expected fraction of rows assigned to the train split.

    Returns (train_data, test_data, train_label, test_label), each with a
    fresh RangeIndex.
    """
    # Fixed typo in the log message ("ration" -> "ratio").
    logging.info('Splitting dataset with ratio %f', ratio)
    # One shared boolean mask keeps features and labels aligned.
    msk = np.random.rand(len(df)) < ratio
    train_data = df[msk].reset_index()
    test_data = df[~msk].reset_index()
    train_label = label[msk].reset_index()
    test_label = label[~msk].reset_index()
    # NOTE(review): reset_index() keeps the old index as an extra 'index'
    # column, which later reaches the DMatrix as a feature; confirm this is
    # intended (use reset_index(drop=True) otherwise).
    return train_data, test_data, train_label, test_label
def get_csv_pandas(files_path):
    """Load one header-less CSV file and assign the export column names.

    files_path -- either a CSV file path or a directory; for a directory the
                  first regular file found in it is loaded.

    Raises exc.UserError when the file cannot be read or parsed.
    """
    # BUGFIX: the original joined files_path onto itself when it was already
    # a file ("data.csv/data.csv" for relative paths).  Resolve the full
    # path once, up front.
    if os.path.isfile(files_path):
        csv_path = files_path
    else:
        first_file = [f for f in os.listdir(files_path)
                      if os.path.isfile(os.path.join(files_path, f))][0]
        csv_path = os.path.join(files_path, first_file)
    try:
        logging.info('Loading csv file %s', csv_path)
        df = pd.read_csv(csv_path, header=None)
        df.columns = columns
        return df
    except Exception as e:
        raise exc.UserError("Failed to load csv data with exception:\n{}".format(e))
def get_pandas_df(data_path):
    """Locate the CSV data under data_path and load it as a DataFrame.

    Returns None when data_path does not exist.  A file path is loaded
    directly; for a directory tree, the first leaf directory (one without
    subdirectories) is handed to get_csv_pandas.
    """
    if not os.path.exists(data_path):
        return None
    if os.path.isfile(data_path):
        files_path = data_path
    else:
        for root, dirs, files in os.walk(data_path):
            if not dirs:
                files_path = root
                break
    return get_csv_pandas(files_path)
def get_df(train_path, validate_path, content_type='text/csv'):
    """Validate and load the training and validation channels as DataFrames.

    Either element of the returned (train, validation) pair is None when the
    corresponding path is missing or empty.
    """
    train_size = get_size(train_path) if train_path else 0
    validate_size = get_size(validate_path) if validate_path else 0
    total_mb = round((train_size + validate_size) / (1024 * 1024), 2)
    logging.debug("File size need to be processed in the node: {}mb.".format(total_mb))
    if train_size > 0:
        validate_data_file_path(train_path, content_type)
    if validate_size > 0:
        validate_data_file_path(validate_path, content_type)
    train_df = get_pandas_df(train_path) if train_size > 0 else None
    validate_df = get_pandas_df(validate_path) if validate_size > 0 else None
    return train_df, validate_df
def get_dmatrices(train_pandas, train_label_pandas, val_pandas, val_label_pandas, ratio=0.8):
    """Build the (train, validation) DMatrix pair for xgb.train().

    When val_pandas is None the training frame is randomly split instead,
    with `ratio` as the train fraction.  Label frames must carry a 'label'
    column (see the expected_value pipeline).
    """
    # BUGFIX: `if val_pandas:` raises "The truth value of a DataFrame is
    # ambiguous" whenever validation data is actually supplied; the intent
    # is a presence check, so compare against None explicitly.
    if val_pandas is not None:
        train_dmatrix = xgb.DMatrix(train_pandas, label=train_label_pandas.loc[:, 'label'])
        val_dmatrix = xgb.DMatrix(val_pandas, label=val_label_pandas.loc[:, 'label'])
    else:
        train_data, test_data, train_label, test_label = train_test_split(train_pandas, train_label_pandas, ratio)
        train_dmatrix = xgb.DMatrix(train_data, label=train_label.loc[:, 'label'])
        val_dmatrix = xgb.DMatrix(test_data, label=test_label.loc[:, 'label'])
    return train_dmatrix, val_dmatrix
def save_encoders(encoder_location, df):
    """Persist the category -> integer-code mapping of every categorical column as JSON."""
    logging.info('Saving encoders')
    encoders = {}
    for column in catcolumns:
        categories = df[column].cat.categories
        encoders[column] = {category: position for position, category in enumerate(categories)}
    with open(encoder_location, 'w') as out:
        json.dump(encoders, out)
def sagemaker_train(train_config, data_config, train_path, val_path, model_dir, sm_hosts, sm_current_host,
                    checkpoint_config):
    """Entry point for a SageMaker training run.

    Validates hyperparameters, loads and preprocesses the CSV channels,
    derives the cache-stability label, and launches train_job() -- either
    directly on a single node or under Rabit for distributed training.

    train_config      -- raw hyperparameter dict from SageMaker (string values).
    data_config       -- channel configuration dict.
    train_path        -- local path of the 'training' channel.
    val_path          -- local path of the optional 'validation' channel.
    model_dir         -- directory that receives the model artifacts.
    sm_hosts          -- list of hosts participating in the job.
    sm_current_host   -- this host's name.
    checkpoint_config -- checkpoint configuration (may contain 'LocalPath').

    Raises exc.UserError for empty channels and exc.PlatformError for an
    invalid host count.
    """
    metrics = metrics_mod.initialize()
    hyperparameters = hpv.initialize(metrics)
    # Custom knob (not an XGBoost hyperparameter): max abs(input-output)
    # price difference still counted as a matching quote; see expected_value.
    price_limit = int(train_config.get('bonitoo_price_limit', 1000))
    # Strip SageMaker/bonitoo control keys and stray quotes before validating
    # the remaining values as real XGBoost hyperparameters.
    train_config = {k: v.replace('"', '') for k, v in train_config.items()
                    if not k.startswith('sagemaker_') and not k.startswith('bonitoo_')}
    train_config = hyperparameters.validate(train_config)
    if train_config.get("updater"):
        train_config["updater"] = ",".join(train_config["updater"])
    logging.info("hyperparameters {}".format(train_config))
    logging.info("channels {}".format(data_config))
    # Get Training and Validation Data Matrices
    validation_channel = data_config.get('validation', None)
    checkpoint_dir = checkpoint_config.get("LocalPath", None)
    train_df, val_df = get_df(train_path, val_path)
    train_df = preprocess_data(train_df)
    # Labels derive from how long each itinerary's price stayed unchanged.
    cachetime_df = compute_cached_time(train_df)
    train_label_df = train_df.apply(lambda x: expected_value(x, cachetime_df, price_limit), axis=1).to_frame(name='label')
    train_df, train_df_orig = remove_non_features(train_df)
    val_label_df = None
    # BUGFIX: `if val_df:` raises "The truth value of a DataFrame is
    # ambiguous" whenever a validation channel is present; test for None.
    if val_df is not None:
        val_df = preprocess_data(val_df)
        val_label_df = val_df.apply(lambda x: expected_value(x, cachetime_df, price_limit), axis=1).to_frame(name='label')
        val_df, val_df_orig = remove_non_features(val_df)
    train_dmatrix, val_dmatrix = get_dmatrices(train_df, train_label_df, val_df, val_label_df)
    train_args = dict(
        train_cfg=train_config,
        train_dmatrix=train_dmatrix,
        train_df=train_df_orig,
        val_dmatrix=val_dmatrix,
        model_dir=model_dir,
        checkpoint_dir=checkpoint_dir)
    # Obtain information about training resources to determine whether to set up Rabit or not
    num_hosts = len(sm_hosts)
    if num_hosts > 1:
        # Wait for hosts to find each other
        logging.info("Distributed node training with {} hosts: {}".format(num_hosts, sm_hosts))
        distributed.wait_hostname_resolution(sm_hosts)
        if not train_dmatrix:
            logging.warning("Host {} does not have data. Will broadcast to cluster and will not be used in distributed"
                            " training.".format(sm_current_host))
        distributed.rabit_run(exec_fun=train_job, args=train_args, include_in_training=(train_dmatrix is not None),
                              hosts=sm_hosts, current_host=sm_current_host, update_rabit_args=True)
    elif num_hosts == 1:
        if train_dmatrix:
            if validation_channel:
                if not val_dmatrix:
                    raise exc.UserError("No data in validation channel path {}".format(val_path))
            logging.info("Single node training.")
            train_args.update({'is_master': True})
            train_job(**train_args)
        else:
            raise exc.UserError("No data in training channel path {}".format(train_path))
    else:
        raise exc.PlatformError("Number of hosts should be an int greater than or equal to 1")
def train_job(train_cfg, train_dmatrix, val_dmatrix, train_df, model_dir, checkpoint_dir, is_master):
    """Run one XGBoost training job and persist the model and encoders.

    train_cfg      -- validated hyperparameter dict; 'num_round' is required.
    train_dmatrix  -- training DMatrix.
    val_dmatrix    -- optional validation DMatrix (None disables evaluation).
    train_df       -- original (pre-feature-drop) training DataFrame; used
                      only to dump the categorical encoders.
    model_dir      -- output directory for model artifacts.
    checkpoint_dir -- checkpoint directory, or falsy to disable saving.
    is_master      -- only the master host writes artifacts.

    Raises exc.UserError for known customer errors raised inside xgb.train()
    and exc.AlgorithmError for any other training failure.
    """
    # Parse arguments for train() API
    early_stopping_rounds = train_cfg.get('early_stopping_rounds')
    num_round = int(train_cfg["num_round"])
    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.get("_tuning_objective_metric")
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)
    # Set callback evals
    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist.append((val_dmatrix, 'validation'))
    # Resume from a previous checkpoint if one exists; the remaining number
    # of boosting rounds shrinks by the rounds already completed.
    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    num_round -= iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)
    callbacks = []
    callbacks.append(checkpointing.print_checkpointed_evaluation(start_iteration=iteration))
    if checkpoint_dir:
        save_checkpoint = checkpointing.save_checkpoint(checkpoint_dir, start_iteration=iteration)
        callbacks.append(save_checkpoint)
    logging.info("Train matrix has {} rows".format(train_dmatrix.num_row()))
    if val_dmatrix:
        logging.info("Validation matrix has {} rows".format(val_dmatrix.num_row()))
    # TODO remove
    #logging.info("cols: %s", str(train_dmatrix.feature_names))
    #raise Exception("cols: %s" % str(train_dmatrix.feature_names))
    try:
        bst = xgb.train(train_cfg, train_dmatrix, num_boost_round=num_round, evals=watchlist, feval=configured_feval,
                        early_stopping_rounds=early_stopping_rounds, callbacks=callbacks, xgb_model=xgb_model,
                        verbose_eval=False)
    except Exception as e:
        # Known, user-actionable errors become UserError; everything else is
        # treated as an internal algorithm failure.
        for customer_error_message in CUSTOMER_ERRORS:
            if customer_error_message in str(e):
                raise exc.UserError(str(e))
        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix, str(e)))
    # NOTE(review): model_dir is created on every host, but artifacts are
    # written only on the master.
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if is_master:
        encoder_location = model_dir + '/encoder.json'
        save_encoders(encoder_location, train_df)
        logging.info("Stored encoders at {}".format(encoder_location))
        model_location = model_dir + '/xgboost-model.bin'
        bst.save_model(model_location)
        logging.info("Stored trained model at {}".format(model_location))
if __name__ == '__main__':
    # Small helper: read one of the JSON config files SageMaker mounts into
    # the training container.
    def _read_json(path):
        with open(path, "r") as fh:
            return json.load(fh)

    train_config = _read_json(os.getenv(sm_env_constants.SM_INPUT_TRAINING_CONFIG_FILE))
    data_config = _read_json(os.getenv(sm_env_constants.SM_INPUT_DATA_CONFIG_FILE))
    checkpoint_config_file = os.getenv(sm_env_constants.SM_CHECKPOINT_CONFIG_FILE)
    checkpoint_config = _read_json(checkpoint_config_file) if os.path.exists(checkpoint_config_file) else {}
    # Channel paths and cluster topology come from the SageMaker environment.
    train_path = os.environ['SM_CHANNEL_TRAINING']
    val_path = os.environ.get(sm_env_constants.SM_CHANNEL_VALIDATION)
    sm_hosts = json.loads(os.environ[sm_env_constants.SM_HOSTS])
    sm_current_host = os.environ[sm_env_constants.SM_CURRENT_HOST]
    model_dir = os.getenv(sm_env_constants.SM_MODEL_DIR)
    sagemaker_train(
        train_config=train_config, data_config=data_config,
        train_path=train_path, val_path=val_path, model_dir=model_dir,
        sm_hosts=sm_hosts, sm_current_host=sm_current_host,
        checkpoint_config=checkpoint_config
    )

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save