From d8696ea56a81dac885e4f9b37debb62a0e7278f6 Mon Sep 17 00:00:00 2001
From: "Streams, James (CPT)" <jestream@nps.edu>
Date: Tue, 28 May 2019 10:34:20 -0700
Subject: [PATCH] latest version of algorithm

---
 ADOE4HPC.R       | 322 +++++++++++++++++++++++++++++++++++++++++++++++
 finalModel.R     |  31 +++++
 modeling3.R      |  34 +++++
 newadptFolder3.R |   8 ++
 response4.R      |  61 +++++++++
 sigFactors.R     |  25 ++++
 6 files changed, 481 insertions(+)
 create mode 100644 ADOE4HPC.R
 create mode 100644 finalModel.R
 create mode 100644 modeling3.R
 create mode 100644 newadptFolder3.R
 create mode 100644 response4.R
 create mode 100644 sigFactors.R

diff --git a/ADOE4HPC.R b/ADOE4HPC.R
new file mode 100644
index 0000000..48ea264
--- /dev/null
+++ b/ADOE4HPC.R
@@ -0,0 +1,322 @@
+##########################################################################################
+##################                     ADOE Code                  ########################
+##########################################################################################
+
+# Purpose: Aid the analyst by letting the program determine the range of factor inputs
+# that best meets the stated objective.
+
+##########################################################################################
+# Packages used:
+
+library(XML)         # Tools for parsing and generating XML within R
+library(stringr)     # Common string operations
+library(dplyr)       # Data manipulation
+library(DiceDesign)  # Generates space-filling designs (DOEs)
+library(tidyr)       # Easily tidy data with the spread() or gather() functions
+library(data.table)  # Extension of data frame
+library(bestglm)     # Best subset GLM and regression utilities
+library(gtools)      # Orders string expressions; mixedsort()
+library(parallel)    # Allows for parallel computing
+# library(DT)        # Optional: interactive data tables (DT::datatable)
+print('Libraries loaded...')
+
+# Set initial working directory
+setwd("/home/jestream/LBCv5/ADOE/")
+
+### Load DoE generation functions
+# Functions here include loading the base-case XML, creating the experimental design, and
+# generating the XML scenarios for execution.
+source('DOEGen.R')         # Loads the DoE generator function (newDOE), based on the 'DiceDesign' NOLH function
+source('scenCreator.R')    # Loads the function (makeXMLs) that takes the design and base XML and creates the XMLs to run
+source('gatherxmls.R')     # Creates a list of XMLs to process through the simulation
+source('runSim.R')         # Loads the function that runs the DoE in parallel
+source('remove.R')         # Removes scenario files to clear room on the hard drive
+source('response4.R')      # Gets the response variable from the data
+source('modeling3.R')      # Generates the model
+source('adaptDOEGen2.R')   # DOE generator for the adaptive portion of the program
+source('newadptFolder3.R') # Creates a new folder to store each iteration's data
+source('finalModel.R')     # Displays the results and the final plot of the fitted meta-model
+source('sigFactors.R')     # Displays the significant factors found in the meta-model
+print('Functions loaded...')
+
+# Location of LBC or the simulation to use
+lbcFolder <- "/home/jestream/LBCv5/"
+
+# xml files location
+setwd("/home/jestream/LBCv5/data/excel/")
+
+# The xml file to parse for analysis
+xmlfile = "SCEN7-Phase-3-5NOV2018-90.v2.xml"
+
+# Where to store the data
+StorageLoc <- "/home/jestream/LBCv5/data/"
+
+# Name of the files for storage
+namefile <- 'finaltest'
+
+# Identify the directory where the files are to be located
+Dir = paste0(StorageLoc,namefile,"/")
+Dir2 = paste0(StorageLoc,namefile,"/")
+
+# The number of replications per design point
+numReps = 30
+
+# Sets the initial range of values for the first DOE
+p = 0.15
+
+# Sets the range of min and max values for follow-on DOEs
+p2 = 0.10
+
+# Starting value for iterations; should remain at 1 unless starting from a different iteration
+k = 1
+
+# Stopping conditions for the adaptive process
+z = 0.1          # fraction of residuals allowed outside the acceptable range
+stopVal <- 0.004 # minimum change needed to continue iterating
+iterThres = 2    # number of consecutive iterations the model is allowed to iterate without improvement
+maxIter = 12     # maximum number of adaptive iterations
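+
+# --- Illustrative aside (not used by the pipeline) ---------------------------------------
+# A minimal sketch of how the stopping settings above interact; the helper names here are
+# hypothetical and exist only to make the logic explicit. The adaptive loop further down
+# keeps iterating while (1) more than z of the residuals fall outside the accepted band,
+# (2) the fit has improved by at least stopVal within the last iterThres iterations, and
+# (3) fewer than maxIter iterations have run.
+keepIterating <- function(pctOutside, noImproveCount, iter) {
+  pctOutside > z && noImproveCount < iterThres && iter < maxIter
+}
+# The no-improvement counter resets whenever the change in the fraction of out-of-range
+# residuals between successive iterations exceeds stopVal, and increments otherwise.
+updateCounter <- function(prevPct, curPct, noImproveCount) {
+  if (abs(curPct - prevPct) > stopVal) 0 else noImproveCount + 1
+}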
+
+# Values for displaying significant factors
+alpha = 0.05 # The maximum p-value a factor may have to be considered significant
+n = 10       # The maximum number of significant factors to return
+
+##########################################################################################
+##################         Search Parameters in XML files         ########################
+##########################################################################################
+pstart = Sys.time() # starts the clock to time how long the program takes to complete
+print('Parsing XML file...')
+# Parses the xmlFile and keeps an internal representation for data extraction
+xmldata <- xmlTreeParse(xmlfile, useInternal = TRUE)
+
+# Locates the node carrying the number-of-replications value
+numReplications = getNodeSet(xmldata, "//LBCAssembly[@numReplications]")
+
+# Find all ThresholdReorderLogic nodes
+ReorderNodes = getNodeSet(xmldata, "//ThresholdReorderLogic")
+
+# Locates all the reorder-point data (this section can be adjusted to look for other
+# areas of interest, such as consumption rates or truck capacities)
+ReorderValueNode = getNodeSet(xmldata, "//ThresholdReorderLogic//*[@value]")
+numNodes = length(ReorderValueNode)
+
+# Create vectors to collect information from the xmlFile
+numNodesName = character()
+nodeNum = integer()
+reps = character()
+name = character()
+Company = character()
+UnitId = character()
+Consumable = character()
+value = character()
+
+# This loop returns all required information from the parent nodes in reference
+# to the attribute named value
+for (i in 1:numNodes){
+  parent = xmlParent(ReorderValueNode[[i]])
+  child = xmlChildren(ReorderNodes[[1]])[i]
+  grandParent = xmlParent(xmlParent(ReorderValueNode[[i]]))
+  nodeNum[i] = i
+  name[i] = xmlAttrs(parent) # gets the methodName of the parent node in reference to the attribute named value
+  reps[i] = xmlAttrs(numReplications[[1]])['numReplications']
+
+  # Use the name to find the company name
+  providerNode = getNodeSet(grandParent, ".//*[@refId]")
+
+  # Extract the consumable
+  Consumable[i] = xmlAttrs(providerNode[[1]])['refId']
+
+  # Extract the company name
+  UnitId[i] = xmlAttrs(providerNode[[2]])['refId']
+
+  # Extract the value
+  valueNode = getNodeSet(parent, ".//*[@value]")
+  value[i] = xmlAttrs(valueNode[[1]])['value']
+
+}
+# Data frame of collected nodes from the entire xmlFile
+ReorderDf = data.frame(nodeNum, reps, name, UnitId, Consumable, value)
+print('Parsing complete...')
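+
+# --- Illustrative aside (not called by the pipeline) --------------------------------------
+# A small, self-contained demo of the XPath/attribute pattern used in the loop above,
+# wrapped in a function so it never runs when this script is sourced. The XML snippet and
+# the function name are made up for illustration only.
+demoAttrExtraction <- function() {
+  doc <- xmlParse('<root>
+                     <ThresholdReorderLogic methodName="setReorderPoint">
+                       <parameter value="0.5"/>
+                     </ThresholdReorderLogic>
+                   </root>', asText = TRUE)
+  valueNodes <- getNodeSet(doc, "//ThresholdReorderLogic//*[@value]") # nodes carrying a value attribute
+  parent <- xmlParent(valueNodes[[1]])                                # the enclosing logic node
+  list(method = xmlAttrs(parent)[['methodName']],                     # "setReorderPoint"
+       value  = xmlAttrs(valueNodes[[1]])[['value']])                 # "0.5"
+}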
+##########################################################################################
+##################          Locating Factors of Interest          ########################
+##########################################################################################
+# Choose the units or factors of interest through the use of dplyr filters
+
+# This can be changed to look for any unit or factor based on the information available
+# from the xml data collection
+
+Div4 <- ReorderDf %>% select(nodeNum, reps, name, UnitId, Consumable, value) %>%
+  filter(str_detect(UnitId, "DIV-4")) %>%
+  filter(!str_detect(UnitId, paste(c("BSB", "FSC", "DISTRO", "ALLIED", "SUST", "EN-BDE",
+                                     "HHC", "HHB", "HQ", "BEB", "MEB", "MI-BN"), collapse = "|"))) %>%
+  filter(!str_detect(Consumable, paste(c("CARGO", "WATER"), collapse = "|")))
+
+# Ensure that MASS is not loaded at the same time as dplyr, or it will mask dplyr's select()
+
+mainFactors <- Div4 %>% filter(str_detect(Consumable, "AMMO")) # replace with your chosen data set
+mainFactors$value <- as.numeric(levels(mainFactors$value))[mainFactors$value] # baseline values to vary to create a range
+# Code found at
+# https://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-integer-numeric-without-loss-of-information
+
+# Now set the range of min and max values for each factor
+# and add those columns to the data frame
+mainFactors['min'] <- (mainFactors$value - mainFactors$value*p)
+mainFactors['max'] <- (mainFactors$value*(1+p))
+
+# From DOEGen.R
+ammoDOE <- newDOE(mainFactors)
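+
+# --- Illustrative aside (not part of the pipeline) ----------------------------------------
+# newDOE() is defined in DOEGen.R, which is not included in this commit. As a rough sketch
+# of the idea only (assumption: newDOE builds a space-filling design on [0,1] with the
+# DiceDesign NOLH generator and then rescales each column to the factor's min/max), the
+# rescaling step could look like the hypothetical helper below.
+scaleDesign <- function(unitDesign, mins, maxs) {
+  # unitDesign: matrix with one column per factor and entries in [0,1]
+  # mins, maxs: factor bounds, e.g. mainFactors$min and mainFactors$max
+  scaled <- sweep(unitDesign, 2, maxs - mins, `*`) # stretch each column to its range width
+  sweep(scaled, 2, mins, `+`)                      # shift each column up to its lower bound
+}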
+
+# Creates the folder for initial storage of data; it is based on the Dir set at the beginning
+dir.create(Dir)
+
+writeScenarios(ammoDOE)
+files <- getXMLs(Dir)
+
+##########################################################################################
+# Runs each scenario through LBC; this takes a while since LBC has to shut down and restart each time
+setwd(lbcFolder)
+
+runEx(files, Dir)
+removeFiles(files)
+
+##########################################################################################
+################   Gather the information for the response variable  ####################
+##########################################################################################
+
+# Finds all the csv files and then puts the names in a list
+csvL <- list.files(path = Dir, pattern = '*.csv')
+csvL <- mixedsort(csvL)
+
+# Creates an empty data frame for information collection;
+# this can be modified based on what information the user is looking for
+results <- data.frame(matrix(ncol = 5))
+colnames(results) <- c('DP', 'REP', 'Bad', 'Total', '%')
+
+test <- getResponse(csvs = csvL, DOE = ammoDOE)
+write.table(test, file = paste0(Dir,'test.csv'), sep = ',')
+##########################################################################################
+# Need to include code that saves each linear model and test data to the folder holding the data
+par(mfrow = c(2,2))
+test.mod <- getModel(test)
+
+# Save the current DOE
+curDOE <- test[,1:nrow(mainFactors)]
+curDOE <- as.data.frame(t(curDOE))
+write.table(curDOE, file = paste0(Dir,'DOE for initial.csv'), sep = ',')
+
+##########################################################################################
+# Storing percent values for the additional stopping criteria
+percents <- list()
+percents <- append(percents, test.mod[1])
+c = 0 # consecutive iterations without sufficient improvement
+
+##########################################################################################
+################   Adaptive portion, runs as long as it meets the conditions  ###########
+##########################################################################################
+while (test.mod[1] > z && c < iterThres && k < maxIter){
+  if(k == 1){
+    print(paste('Now executing iteration',k))
+
+    DOE <- aDOE(test, test.mod[2], mainFactors)
+
+    drive <- newFolder(StorageLoc, k)
+
+    Dir = paste0(drive[1],'/')
+    Dir2 = paste0(drive[2],'/')
+
+    setwd(lbcFolder)
+
+    writeScenarios(DOE)
+
+    files <- getXMLs(Dir)
+
+    runEx(files, Dir)
+
+    removeFiles(files)
+
+    # Finds all the csv files and then puts the names in a list
+    csvL <- list.files(path = Dir, pattern = '*.csv')
+
+    # Creates an empty data frame for information collection;
+    # this can be modified based on what information the user is looking for
+    results <- data.frame(matrix(ncol = 5))
+    colnames(results) <- c('DP', 'REP', 'Bad', 'Total', '%')
+
+    newTest <- getResponse(csvs = csvL, DOE = DOE)
+    test <- rbind(test,newTest)
+
+    curDOE <- test[,1:nrow(mainFactors)]
+    curDOE <- as.data.frame(t(curDOE))
+    write.table(curDOE, file = paste0(Dir,'DOE_for_turn ',k,'.csv'), sep = ',')
+
+    par(mfrow = c(2,2))
+    test.mod <- getModel(test)
+
+    percents <- append(percents, test.mod[1])
+    diff <- abs(percents[[k+1]] - percents[[k]])
+    if(diff > stopVal){
+      c = 0
+    }else{
+      c = c + 1
+    }
+
+    k = k + 1
+
+  } else {
+
+    print(paste('Now executing iteration', k))
+    DOE <- aDOE(test, test.mod[2], mainFactors)
+    drive <- newFolder(StorageLoc, k)
+
+    Dir = paste0(drive[1],'/')
+    Dir2 = paste0(drive[2],'/')
+
+    setwd(lbcFolder)
+
+    writeScenarios(DOE)
+
+    files <- getXMLs(Dir)
+
+    runEx(files, Dir)
+
+    removeFiles(files)
+
+    csvL <- list.files(path = Dir, pattern = '*.csv')
+
+    results <- data.frame(matrix(ncol = 3, nrow = length(csvL)))
+    colnames(results) <- c('All','Total', 'All%')
+
+    newTest <- getResponse(csvs = csvL, DOE = DOE)
+    test <- rbind(test,newTest)
+
+    curDOE <- test[,1:nrow(mainFactors)]
+    curDOE <- as.data.frame(t(curDOE))
+
+    write.table(curDOE, file = paste0(Dir,'DOE_for_turn ',k,'.csv'), sep = ',')
+
+    par(mfrow = c(2,2))
+    test.mod <- getModel(test)
+
+    percents <- append(percents, test.mod[1])
+    diff <- abs(percents[[k+1]] - percents[[k]])
+    if(diff > stopVal){
+      c = 0
+    }else{
+      c = c + 1
+    }
+
+    k = k + 1
+  }
+}
+
+final.mod <- finalModel(test)
+
+sigFactors(final.mod)
+
+# End program
+print('Program complete')
+pend = Sys.time()
+print(pend - pstart)
+
+
diff --git a/finalModel.R b/finalModel.R
new file mode 100644
index 0000000..511a0ed
--- /dev/null
+++ b/finalModel.R
@@ -0,0 +1,31 @@
+finalModel <- function(data){
+  # Fit the final model
+  final.mod <- lm(y~(.)^2, data = data)
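+  # Note: the formula y ~ (.)^2 regresses the response on every other column of `data`
+  # (the factor settings) plus all two-way interactions; for example, with factors X1, X2
+  # and X3 it is equivalent to y ~ X1 + X2 + X3 + X1:X2 + X1:X3 + X2:X3.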
+
+  favg <- mean(final.mod$residuals)
+  fsigma <- sd(final.mod$residuals)
+
+  fub <- favg + 2*fsigma
+  flb <- favg - 2*fsigma
+
+  resid <- sum(final.mod$residuals > fub) + sum(final.mod$residuals < flb)
+  print(paste0('The number of residuals outside of the accepted range is ', resid,' out of ',nrow(data), '.'))
+  print(paste('The program took', k, 'iterations to find the metamodel.'))
+  print(paste0('The percentage of residuals within the acceptable range is ', round((1-resid/nrow(data)),4)*100,'%.'))
+  if(test.mod[1] > z){
+    print(paste0('Algorithm stopped because the fit failed to improve by the stopping threshold for ',
+                 iterThres, ' consecutive iterations or the maximum of ', maxIter, ' iterations was reached.'))
+  }else{
+    print('Algorithm stopped because enough residuals fell within the accepted range.')
+  }
+
+  # Plot the residuals-vs-fitted graph with the accepted range, then the normal Q-Q plot
+  par(mfrow = c(2,1))
+  plot(final.mod, which = 1)                   # residuals vs fitted
+  abline(h = fub, col = 'red', lty = 'dashed') # upper bound of the accepted residual range
+  abline(h = flb, col = 'red', lty = 'dashed') # lower bound of the accepted residual range
+  plot(final.mod, which = 2)                   # normal Q-Q plot
+  return(final.mod)
+}
\ No newline at end of file
diff --git a/modeling3.R b/modeling3.R
new file mode 100644
index 0000000..7b251d0
--- /dev/null
+++ b/modeling3.R
@@ -0,0 +1,34 @@
+##########################################################################################
+################   Fit the metamodel and identify the next design point  ################
+##########################################################################################
+
+getModel <- function(data){
+  print('Conducting regression analysis of the data...')
+  # This finds the residuals that fall outside an accepted range and returns the fraction
+  # of residuals outside that range and the design point with the largest residual
+  lm.mod <- lm(y~(.)^2, data = data)
+  print('Model complete...')
+  setwd(Dir)
+  print('Saving model to folder...')
+  save(lm.mod, file = 'lnmod.RData')
+  # https://stackoverflow.com/questions/14761496/saving-and-loading-a-model-in-r
+  print('Saving data frame to folder...')
+  write.table(data, file = 'model data.csv', sep = ',')
+  # https://datascienceplus.com/exporting-data-from-r/
+
+  avg <- mean(lm.mod$residuals)
+  sigma <- sd(lm.mod$residuals)
+
+  ub <- avg + 2*sigma
+  lb <- avg - 2*sigma
+
+  resid <- sum(lm.mod$residuals > ub) + sum(lm.mod$residuals < lb)
+  percent <- resid/nrow(data)
+  print(paste0('The number of residuals outside of the accepted range is ',resid,'.'))
+  print(paste0('The current length of the DOE is ', nrow(data),'.'))
+  print(paste0('The percentage of residuals outside of the accepted range is ', round(percent,5), '.'))
+  newCen <- which.max(abs(lm.mod$residuals))
+  print(paste0('Building new DOE using design point ', names(lm.mod$residuals[newCen]),'.'))
+  print('Modeling complete...')
+  plot(lm.mod) # diagnostic plots are a side effect; only the two values below are returned
+  return(c(percent, newCen))
+}
\ No newline at end of file
diff --git a/newadptFolder3.R b/newadptFolder3.R
new file mode 100644
index 0000000..391d6e2
--- /dev/null
+++ b/newadptFolder3.R
@@ -0,0 +1,8 @@
+newFolder <- function(path, k){
+  # namefile is the run name set in ADOE4HPC.R
+  dir.create(paste0(path, namefile, k)) # Create folder for storage
+
+  Dir = paste0(path, namefile, k)
+  Dir2 = paste0(path, namefile, k)
+
+  return(c(Dir, Dir2))
+}
\ No newline at end of file
diff --git a/response4.R b/response4.R
new file mode 100644
index 0000000..c492b83
--- /dev/null
+++ b/response4.R
@@ -0,0 +1,61 @@
+##########################################################################################
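+# getResponse() reads each design point's output csv and builds the response variable:
+# for every replication, y is the fraction of tracked DIV-4 unit/commodity combinations
+# (convoys and UNLIMITED sources excluded) whose minimum on-hand balance dropped below
+# 60% of its starting balance. It returns one row per replication per design point, with
+# that design point's factor settings attached.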
+getResponse <- function(csvs,DOE){
+  print('Generating response for each experiment...')
+  start_time4 <- Sys.time()
+
+  for (i in 1:length(csvs)){
+
+    outputData1 <- fread(paste0(Dir,csvs[i]))
+
+    # print(paste0('The number of replications imported ', length(unique(outputData1$replication)), '.'))
+
+    ## Get the balance-on-hand records for the DIV-4 supply table
+    supplyTable <- outputData1 %>% dplyr::filter(propertyName=="balanceOnHand") %>%
+      filter(str_detect(`entityName`, "DIV-4")) %>% collect()
+
+    # Find the minimum supply level reached by unit/commodity, excluding convoys
+    minsupplyTable <- supplyTable %>% filter(!str_detect(`data`,"Convoy")) %>%
+      group_by(replication, entityName, data) %>%
+      summarise(minOnHand=min(as.numeric(newValue), na.rm=TRUE)) %>%
+      ungroup()
+
+    # Find the minimum supply level as a share of the starting level by unit/commodity, excluding convoys
+    minPercentTable <- supplyTable %>%
+      filter(!str_detect(`data`,"Convoy")) %>%
+      filter(!str_detect(`entityName`,"UNLIMITED")) %>%
+      dplyr::select(-eventName, -propertyName, -oldValue) %>%
+      group_by(replication, entityName, data) %>%
+      mutate(minOnHand=min(as.numeric(newValue), na.rm=TRUE), minTime=min(time)) %>%
+      ungroup() %>%
+      filter(time==minTime) %>%
+      mutate(minPercOnHand=minOnHand/as.numeric(newValue)) %>%
+      dplyr::select(data, replication, entityName, minPercOnHand)
+
+    # Count, per replication, the unit/commodity combinations that fell below 60% of their starting level
+    sub <- minPercentTable %>% group_by(replication) %>% filter(minPercOnHand < .60) %>% tally()
+    things <- data.frame(matrix(ncol=5))
+    colnames(things) <- c('DP', 'REP', 'Bad', 'Total', '%')
+    for (j in 1:nrow(sub)){
+      things[j,1] <- paste0('DP',i)
+      things[j,2] <- j
+      things[j,3] <- sub[j,2]
+      things[j,4] <- nrow(minPercentTable[minPercentTable$replication == j,])
+      things[j,5] <- things[j,3]/things[j,4]
+    }
+    if (i == 1){
+      results <- things
+    }else{
+      results <- rbind(results, things)
+    }
+
+  }
+  # Applies the response variable to the DOE
+  inputdf <- data.frame(t(DOE))
+  repdf <- inputdf[rep(seq_len(nrow(inputdf)), each = numReps),] # repeat each design point's settings once per replication
+  test <- repdf[1:nrow(repdf),]
+  test$y <- results[,5]
+  print('Data complete for regression analysis')
+  end_time4 <- Sys.time()
+  print(total4 <- end_time4 - start_time4)
+  return(test)
+}
\ No newline at end of file
diff --git a/sigFactors.R b/sigFactors.R
new file mode 100644
index 0000000..af1170d
--- /dev/null
+++ b/sigFactors.R
@@ -0,0 +1,25 @@
+sigFactors <- function(mod){
+  # Display the top n significant factors of the model
+
+  focus <- data.frame(names(mod$coefficients[!is.na(mod$coefficients)]))
+  # https://stackoverflow.com/questions/16724148/getting-predictor-names-in-r-regression
+
+  focus$pvalue <- round(coef(summary(mod))[,4],5)
+  # https://stackoverflow.com/questions/23838937/extract-pvalue-from-glm
+
+  focus$coeff <- round(coef(summary(mod))[,1],5)
+  colnames(focus)[1] <- 'Factors'
+  focus <- focus[!focus$Factors == '(Intercept)',]
+  impFac <- focus[focus$pvalue < alpha,]
+  top10 <- impFac[order(abs(impFac$coeff), decreasing = TRUE),]
+  top10$coeff <- format(top10$coeff, scientific = FALSE)
+  top10 <- top10[complete.cases(top10),]
+  print(head(top10, n))
+
+  if(nrow(top10) < n){
+    print('More iterations or adjustments to the initial settings may be needed to find more or different significant factors.')
+  }else{
+    print(paste0('These are the top ',n, ' out of ',nrow(top10), ' significant factors.'))
+  }
+  return(top10) # use DT::datatable(top10) instead for an interactive table when the DT package is loaded
+}
--
GitLab