# Source code for dlmfg.kmc_gen.kmc_model

"""Feature importances of ensemble models such as gradient boosted trees or random forests are used to determine the KMCs for each KCC.
"""

import os
import sys
current_path=os.path.dirname(__file__)
parentdir = os.path.dirname(current_path)

#Adding Path to various Modules
sys.path.append("../core")
sys.path.append("../visualization")
sys.path.append("../utilities")
sys.path.append("../datasets")
sys.path.append("../trained_models")
sys.path.append("../config")


#from sklearn import RandomForestRegressor
import pathlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb
from sklearn.externals import joblib

#Importing Config files
import assembly_config as config
import model_config as cftrain

#Importing required modules from the package
from measurement_system import HexagonWlsScanner
from assembly_system import VRMSimulationModel
from data_import import GetTrainData
import voxel_config as vc
from cop_viz import CopViz


def kmc_model_build(tree_based_model,point_data,selected_kcc,kcc_name,split_ratio=0.2,save_model=0,importance_threshold=0.01):
    """Train a tree-based regressor on node deviations and return the KMCs of one KCC.

    The model is fitted to predict ``selected_kcc`` from ``point_data``. Every
    input column whose feature importance is at least ``importance_threshold``
    is kept, and the kept columns are split into three equal ranges of column
    positions (first, middle and last third), reported as x, y and z.

    :param tree_based_model: Type of model to be used for feature importance, ``'rf'`` (random forest) or ``'xgb'`` (XGBoost)
    :type tree_based_model: str (required)

    :param point_data: input data consisting of node deviations
    :type point_data: pandas.DataFrame or numpy.array (samples*nodes) (required)

    :param selected_kcc: output data consisting of selected process parameter/KCC
    :type selected_kcc: pandas.DataFrame or numpy.array (samples*1) (required)

    :param kcc_name: unique identifier for the KCC, used in log messages and in the saved model file name
    :type kcc_name: str (required)

    :param split_ratio: fraction of the samples held out as test data
    :type split_ratio: float

    :param save_model: Save model flag, set to 1 to save model
    :type save_model: int

    :param importance_threshold: minimum feature importance for a node to be kept as a KMC
    :type importance_threshold: float

    :returns: filtered_nodeIDs_x, rows (``node_id``, ``Feature_Importance``) whose node_id lies in the first third of the columns
    :rtype: pandas.DataFrame [kmcs*2]

    :returns: filtered_nodeIDs_y, rows whose node_id lies in the middle third of the columns
    :rtype: pandas.DataFrame [kmcs*2]

    :returns: filtered_nodeIDs_z, rows whose node_id lies in the last third of the columns
    :rtype: pandas.DataFrame [kmcs*2]

    :raises ValueError: if ``tree_based_model`` is neither ``'rf'`` nor ``'xgb'``
    """
    # Fail early with a clear message instead of an unbound ``model`` later on.
    if tree_based_model not in ('rf', 'xgb'):
        raise ValueError(
            "tree_based_model must be 'rf' or 'xgb', got %r" % (tree_based_model,))

    # Honour the caller's split ratio (it used to be hard-coded to 0.2).
    train_X, test_X, train_y, test_y = train_test_split(
        point_data, selected_kcc, test_size=split_ratio)

    # Renumber the training rows from 0; plain numpy arrays have no index.
    if hasattr(train_X, 'reset_index'):
        train_X = train_X.reset_index(drop=True)
    if hasattr(train_y, 'reset_index'):
        train_y = train_y.reset_index(drop=True)

    print('KMC Generation for selected :', kcc_name)

    if tree_based_model == 'rf':
        # Imported locally: the file-level import of this class is commented out.
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor(
            n_estimators=500, max_depth=300, n_jobs=-1, verbose=True)
    else:
        model = xgb.XGBRegressor(
            colsample_bytree=0.4, gamma=0.045, learning_rate=0.07,
            max_depth=500, min_child_weight=1.5, n_estimators=150,
            reg_alpha=0.65, reg_lambda=0.45, subsample=0.95,
            n_jobs=-1, verbose=True)

    model.fit(train_X, train_y)

    y_pred = model.predict(test_X)
    mae = metrics.mean_absolute_error(test_y, y_pred)
    print('The MAE for feature selection for: ', kcc_name)
    print(mae)

    if save_model == 1:
        # NOTE(review): the '_XGB_model.sav' suffix is used for both model
        # types; kept unchanged so existing consumers of the file still find it.
        filename = kcc_name + '_XGB_model.sav'
        joblib.dump(model, filename)
        print('Trained Model Saved to disk....')

    n_features = point_data.shape[1]
    print(n_features)

    # 1-based node ids next to their importances, both stored as float64.
    node_IDs = pd.DataFrame({
        'node_id': np.arange(1, n_features + 1, dtype=np.float64),
        'Feature_Importance': np.asarray(model.feature_importances_, dtype=np.float64),
    })
    node_IDs = node_IDs.sort_values('Feature_Importance', ascending=False)
    filtered_nodeIDs = node_IDs.loc[node_IDs['Feature_Importance'] >= importance_threshold]

    # Column positions are split into three equal ranges: x, y, z.
    x_end = n_features / 3
    y_end = n_features * 2 / 3
    ids = filtered_nodeIDs['node_id']

    # Independent copies, so callers can modify the frames without pandas
    # chained-assignment warnings.
    filtered_nodeIDs_x = filtered_nodeIDs.loc[ids <= x_end].copy()
    filtered_nodeIDs_y = filtered_nodeIDs.loc[(ids > x_end) & (ids <= y_end)].copy()
    filtered_nodeIDs_z = filtered_nodeIDs.loc[(ids > y_end) & (ids <= n_features)].copy()

    return filtered_nodeIDs_x, filtered_nodeIDs_y, filtered_nodeIDs_z

if __name__ == '__main__':
    # Script entry point: read the assembly and training configuration, load
    # the cloud-of-point and KCC data, generate the KMCs of every KCC, save
    # them as CSV files and (optionally) plot them.

    print('Parsing from Assembly Config File....')
    assembly_cfg = config.assembly_system
    data_type = assembly_cfg['data_type']
    application = assembly_cfg['application']
    part_type = assembly_cfg['part_type']
    part_name = assembly_cfg['part_name']
    data_format = assembly_cfg['data_format']
    assembly_type = assembly_cfg['assembly_type']
    assembly_kccs = assembly_cfg['assembly_kccs']
    assembly_kpis = assembly_cfg['assembly_kpis']
    voxel_dim = assembly_cfg['voxel_dim']
    point_dim = assembly_cfg['point_dim']
    voxel_channels = assembly_cfg['voxel_channels']
    noise_type = assembly_cfg['noise_type']
    mapping_index = assembly_cfg['mapping_index']
    file_names_x = assembly_cfg['data_files_x']
    file_names_y = assembly_cfg['data_files_y']
    file_names_z = assembly_cfg['data_files_z']
    system_noise = assembly_cfg['system_noise']
    aritifical_noise = assembly_cfg['aritifical_noise']
    data_folder = assembly_cfg['data_folder']
    kcc_folder = assembly_cfg['kcc_folder']
    kcc_files = assembly_cfg['kcc_files']

    print('Parsing from Training Config File')
    kmc_cfg = cftrain.kmc_params
    tree_based_model = kmc_cfg['tree_based_model']
    # Read from the config but not used anywhere below.
    importance_creteria = kmc_cfg['importance_creteria']
    save_model = kmc_cfg['save_model']
    split_ratio = kmc_cfg['split_ratio']
    plot_kmc = kmc_cfg['plot_kmc']

    print('Creating file Structure....')
    train_path = '../trained_models/' + part_type
    kmc_path = train_path + '/kmc'
    kmc_plot_path = kmc_path + '/plots'
    # parents=True also creates train_path and kmc_path.
    pathlib.Path(kmc_plot_path).mkdir(parents=True, exist_ok=True)

    print('Initializing the Assembly System and Measurement System....')
    # NOTE(review): measurement_system is constructed but never used below;
    # the construction is retained in case it has side effects — confirm.
    measurement_system = HexagonWlsScanner(data_type, application, system_noise, part_type, data_format)
    vrm_system = VRMSimulationModel(assembly_type, assembly_kccs, assembly_kpis, part_name, part_type, voxel_dim, voxel_channels, point_dim, aritifical_noise)
    get_data = GetTrainData()

    print('Importing and preprocessing Cloud-of-Point Data')
    # One frame per axis, each cut to the first point_dim columns.
    dataset = [
        get_data.data_import(file_names, data_folder).iloc[:, 0:point_dim]
        for file_names in (file_names_x, file_names_y, file_names_z)
    ]
    kcc_dataset = get_data.data_import(kcc_files, kcc_folder)
    # NOTE(review): the mapping index is loaded but not used below; the call
    # is retained in case the load is relied on as a check — confirm.
    point_index = get_data.load_mapping_index(mapping_index)

    # Columns are ordered x, y, z, giving 3*point_dim columns in total.
    point_data = pd.concat([dataset[0], dataset[1], dataset[2]], axis=1, ignore_index=True)

    cop_file_name = vc.voxel_parameters['nominal_cop_filename']
    file_path = '../resources/nominal_cop_files/' + cop_file_name

    # Read cop from csv file
    print('Importing Nominal COP')
    nominal_cop = vrm_system.get_nominal_cop(file_path)
    copviz = CopViz(nominal_cop)

    print('Generating KMC for all KCCs...')
    for i in range(assembly_kccs):
        kcc_name = "KCC_" + str(i + 1)
        selected_kcc = kcc_dataset.iloc[:, i:i + 1]

        filtered_nodeIDs_x, filtered_nodeIDs_y, filtered_nodeIDs_z = kmc_model_build(
            tree_based_model, point_data, selected_kcc, kcc_name, split_ratio, save_model)
        print(point_dim)

        # Shift y and z node ids back to per-axis numbering and store all ids
        # as integers. ``assign`` builds new frames, so nothing is written
        # into a filtered slice (avoids chained-assignment warnings).
        filtered_nodeIDs_x = filtered_nodeIDs_x.assign(
            node_id=filtered_nodeIDs_x['node_id'].astype('int'))
        filtered_nodeIDs_y = filtered_nodeIDs_y.assign(
            node_id=(filtered_nodeIDs_y['node_id'] - point_dim).astype('int'))
        filtered_nodeIDs_z = filtered_nodeIDs_z.assign(
            node_id=(filtered_nodeIDs_z['node_id'] - (2 * point_dim)).astype('int'))

        print('Saving KMCs to disk...')
        for axis, frame in zip('xyz', (filtered_nodeIDs_x, filtered_nodeIDs_y, filtered_nodeIDs_z)):
            frame.to_csv(kmc_path + '/' + kcc_name + '_' + axis + '.csv', index=False)

        # Honour the plot_kmc flag from the training config; it used to be
        # read but ignored, so plots were always generated.
        if plot_kmc:
            stack = copviz.get_data_stacks(filtered_nodeIDs_x, filtered_nodeIDs_y, filtered_nodeIDs_z)
            copviz.plot_multiple_stacks(stack, kmc_plot_path + '/' + kcc_name)