From d8696ea56a81dac885e4f9b37debb62a0e7278f6 Mon Sep 17 00:00:00 2001
From: "Streams, James (CPT)" <jestream@nps.edu>
Date: Tue, 28 May 2019 10:34:20 -0700
Subject: [PATCH] latest version of algorithm

---
 ADOE4HPC.R       | 322 +++++++++++++++++++++++++++++++++++++++++++++++
 finalModel.R     |  31 +++++
 modeling3.R      |  34 +++++
 newadptFolder3.R |   8 ++
 response4.R      |  61 +++++++++
 sigFactors.R     |  25 ++++
 6 files changed, 481 insertions(+)
 create mode 100644 ADOE4HPC.R
 create mode 100644 finalModel.R
 create mode 100644 modeling3.R
 create mode 100644 newadptFolder3.R
 create mode 100644 response4.R
 create mode 100644 sigFactors.R

diff --git a/ADOE4HPC.R b/ADOE4HPC.R
new file mode 100644
index 0000000..48ea264
--- /dev/null
+++ b/ADOE4HPC.R
@@ -0,0 +1,322 @@
+##########################################################################################
+##################                ADOE Code                     ##########################
+##########################################################################################
+
+# Purpose: Aid analyst in analysis by allowing the program to determine the best range
+# the factor inputs should be to obtain an objective.
+
+##########################################################################################
+# Packages used:
+
+library(XML)        # Tools for parsing and generating XML within R
+library(stringr)    # Common string operations
+library(dplyr)      # Data manipulation
+library(DiceDesign) # Generates DOEs (space-filling designs)
+library(tidyr)      # Easily tidy data with the spread() or gather() functions
+library(data.table) # Extension of data frame
+library(bestglm)    # Best subset GLM and regression utilities
+library(gtools)     # Order string expressions; mixedsort()
+library(parallel)   # Allows for parallel computing
+library(DT)         # Interactive data tables; needed for datatable() in sigFactors.R
+print('Libraries loaded...')
+
+# Set initial working directory
+setwd("/home/jestream/LBCv5/ADOE/")
+
+### Load DoE Generation Functions
+# Functions here include loading the base case XML, creating the experimental design, and generating XML scenarios for execution
+source('DOEGen.R')  # Loads DoE Generator Function (newDOE) based on 'DiceDesign' NOLH function.  
+source('scenCreator.R') # Loads function to take design and base XML (makeXMLs) to create required number of XMLs to run 
+source('gatherxmls.R') # Creates a list of XMLs to process through the simulation
+source('runSim.R') # Loads function that runs DoE in parallel.
+source('remove.R') # Remove scenarios files to clear room on hard drive
+source('response4.R') # Gets the response variable from the data
+source('modeling3.R') # generates the model
+source('adaptDOEGen2.R') # DOE generator for the adaptive portion of the program
+source('newadptFolder3.R') # Creates new folder for storage of each iterations data
+source('finalModel.R') # Displays the results and the final plot of the found meta-model
+source('sigFactors.R') # Displays the found significant factors from the meta-model
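+# (DOEGen.R, scenCreator.R, gatherxmls.R, runSim.R, remove.R, and adaptDOEGen2.R are
+# expected to already exist in this directory; they are not added by this patch.)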
+print('Functions loaded...')
+
+# Location of LBC or Simulation to use
+lbcFolder <- "/home/jestream/LBCv5/"
+
+# xml files location
+setwd("/home/jestream/LBCv5/data/excel/")
+
+# The xml file to parse for analysis
+xmlfile = "SCEN7-Phase-3-5NOV2018-90.v2.xml"
+
+# Where to store the data
+StorageLoc <- "/home/jestream/LBCv5/data/"
+
+# Name of the files for storage
+namefile <- 'finaltest'
+
+# identify the directory where the files are to be located
+Dir = paste0(StorageLoc,namefile,"/")
+Dir2 = paste0(StorageLoc,namefile,"/")
+
+# the number of replications per design point
+numReps = 30     
+
+# Sets the initial range of values for the first DOE (each factor is varied +/- p around its base value)
+p = 0.15 
+
+# Sets the range of min and max values for follow-on DOEs
+p2 = 0.10
+
+# Starting value for the iteration counter; should remain at 1 unless resuming from a later iteration
+k = 1                           
+
+# Stopping condition for adaptive process
+z = 0.1 # maximum allowable proportion of residuals outside the acceptable range
+stopVal <- 0.004 # minimum change needed to continue iterating
+iterThres = 2   # number of consecutive iterations the model is allowed to run without improvement
+maxIter = 12    # maximum number of adaptive iterations
+
+# Values for displaying significant factors
+alpha = 0.05  # The maximum p-value a factor may have to be considered significant
+n = 10        # The max number of significant factors to return 
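+
+# How these interact (see the adaptive while loop below): iteration continues while more
+# than z of the residuals fall outside the mean +/- 2*sd band, at least one of the last
+# iterThres iterations improved the fit by more than stopVal, and fewer than maxIter
+# iterations have been run.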
+
+##########################################################################################
+##################       Search Parameters in XML files         ##########################
+##########################################################################################
+pstart = Sys.time() # starts the clock to time how long the program takes to complete
+print('Parsing XML file...')
+# Breaks the xmlFile into sections and saves a temp version for data extraction
+xmldata <- xmlTreeParse(xmlfile, useInternal = TRUE)
+
+# Locates the numReplications attribute value
+numReplications = getNodeSet(xmldata, "//LBCAssembly[@numReplications]") 
+
+# find all ThresholdReorderLogic Nodes
+ReorderNodes=getNodeSet(xmldata, "//ThresholdReorderLogic")
+
+# Locates all the reorder point data (this section can be adjusted to look for other 
+#                         areas of interest such as consumption rate or truck capacities)
+ReorderValueNode = getNodeSet(xmldata, "//ThresholdReorderLogic//*[@value]")
+numNodes = length(ReorderValueNode)
+
+# Create lists to collect information from xmlFile
+numNodesName=character()
+nodeNum = integer()
+reps = character()
+name = character()
+Company = character()
+UnitId = character()
+Consumable =character()
+value = character()
+
+# This loop walks every node carrying a 'value' attribute and records its parent's name,
+# the replication count, the owning unit, the consumable, and the value itself
+for (i in 1:numNodes){
+  parent = xmlParent(ReorderValueNode[[i]])
+  child = xmlChildren(ReorderNodes[[1]])[i]
+  grandParent = xmlParent(xmlParent(ReorderValueNode[[i]]))
+  nodeNum[i] = i
+  name[i] = xmlAttrs(parent) #gets the methodName of the parent node in reference to the attribute value name
+  reps[i] = xmlAttrs(numReplications[[1]])['numReplications']
+  
+  # use the name to find the Company Name
+  providerNode = getNodeSet(grandParent, ".//*[@refId]")
+  
+  # extract the consumable
+  Consumable[i]= xmlAttrs(providerNode[[1]])['refId']
+  
+  # extract the Company Name
+  UnitId[i]= xmlAttrs(providerNode[[2]])['refId']
+  
+  # extract the value
+  valueNode = getNodeSet(parent, ".//*[@value]")
+  value[i]= xmlAttrs(valueNode[[1]])['value']
+  
+}
+# Data frame of collected nodes from entire xmlFile
+ReorderDf = data.frame(nodeNum, reps, name, UnitId, Consumable, value)
+print('Parsing complete...')
+##########################################################################################
+##################      Locating Factors of Interest            ##########################
+##########################################################################################
+# Choose the units or factors of interest through the use of dplyr filters
+
+# This can be changed to look for any unit or factor based on the information available 
+# from the xml data collection
+
+Div4 <- ReorderDf %>%
+  select(nodeNum, reps, name, UnitId, Consumable, value) %>%
+  filter(str_detect(UnitId, "DIV-4")) %>%
+  filter(!str_detect(UnitId, paste(c("BSB", "FSC", "DISTRO", "ALLIED", "SUST", "EN-BDE",
+                                     "HHC", "HHB", "HQ", "BEB", "MEB", "MI-BN"), collapse = "|"))) %>%
+  filter(!str_detect(Consumable, paste(c("CARGO", "WATER"), collapse = "|")))
+
+# Ensure that MASS is not loaded alongside dplyr, or it will mask dplyr's select() function
+
+mainFactors <- Div4 %>% filter(str_detect(Consumable, "AMMO"))          # replace with your chosen data set
+mainFactors$value <- as.numeric(levels(mainFactors$value))[mainFactors$value] # values to vary to create a range
+# Code found at
+# https://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-integer-numeric-without-loss-of-information
+
+# Now set the range of min and max values
+
+# add those values to the data frame
+mainFactors['min'] <- (mainFactors$value - mainFactors$value*p)
+mainFactors['max'] <- (mainFactors$value*(1+p))
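+# e.g. with p = 0.15, a base value of 200 gives min = 200 - 200*0.15 = 170 and max = 200*1.15 = 230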
+
+# From DOEGen.R 
+ammoDOE <- newDOE(mainFactors)
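+
+# newDOE() above comes from DOEGen.R. For orientation only, a minimal sketch of a
+# NOLH-style generator built on DiceDesign::nolhDesign(); the name, scaling, and
+# orientation (factors as rows, design points as columns) are illustrative assumptions,
+# not the actual DOEGen.R implementation:
+sketchNOLHDOE <- function(factors){
+  d <- nolhDesign(nrow(factors), range = c(0, 1))$design   # NOLH design scaled to [0,1]
+  scaled <- sweep(d, 2, factors$max - factors$min, '*')     # stretch each column to its factor's span
+  scaled <- sweep(scaled, 2, factors$min, '+')              # shift each column into [min, max]
+  t(scaled)                                                 # factors as rows, design points as columns
+}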
+
+# Creates the folder for initial data storage, based on the Dir set at the beginning
+dir.create(Dir)
+
+writeScenarios(ammoDOE)
+files <- getXMLs(Dir)
+
+##########################################################################################
+# Runs each scenario through LBC; this takes a while since it has to shut down and restart LBC each time
+setwd(lbcFolder)
+
+runEx(files, Dir)
+removeFiles(files)
+
+##########################################################################################
+################   Gather the information for the response variable   ####################
+##########################################################################################
+
+# Finds all the csv files and then puts the names in a list
+csvL <- list.files(path = Dir, pattern = '*.csv')
+csvL <- mixedsort(csvL)
+
+# Creates an empty data frame for information collection;
+# this can be modified based on the information the user is looking for
+results <- data.frame(matrix(ncol = 5))
+colnames(results) <-c('DP', 'REP', 'Bad', 'Total', '%')
+
+
+test <- getResponse(csvs = csvL, DOE = ammoDOE)
+write.table(test, file = paste0(Dir,'test.csv'), sep = ',')
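+# 'test' now holds one row per design point x replication: the factor settings plus the
+# response y (the share of unit/commodity records whose minimum on-hand fell below 60%)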
+##########################################################################################
+# Note: getModel() (modeling3.R) saves each fitted model ('lnmod.RData') and its data frame ('model data.csv') to the current Dir
+par(mfrow = c(2,2))
+test.mod <- getModel(test)
+
+# Save the current DOE
+curDOE <- test[,1:nrow(mainFactors)]
+curDOE <- as.data.frame(t(curDOE))
+write.table(curDOE, file = paste0(Dir,'DOE for initial .csv'), sep = ',')
+
+##########################################################################################
+# Store the out-of-range residual proportion from each fit; c counts consecutive
+# iterations whose improvement was below stopVal (an additional stopping criterion)
+percents <- list()
+percents <- append(percents, test.mod[1])
+c = 0
+
+##########################################################################################
+################   Adaptive Portion, as long as it meets conditions   ####################
+##########################################################################################
+while (test.mod[1] > z && c < iterThres && k < maxIter){
+  if(k == 1){
+    print(paste('Now executing iteration',k))
+    
+    DOE <- aDOE(test, test.mod[2], mainFactors)
+    
+    drive <- newFolder(StorageLoc, k)
+    
+    Dir = paste0(drive[1],'/')
+    Dir2 = paste0(drive[2],'/')
+    
+    setwd(lbcFolder) 
+    
+    writeScenarios(DOE)
+    
+    files <- getXMLs(Dir)
+    
+    runEx(files, Dir)
+    
+    removeFiles(files)
+    
+    # Finds all the csv files and puts the names in a list
+    csvL <- list.files(path = Dir, pattern = '*.csv')
+    csvL <- mixedsort(csvL)   # natural sort keeps the files aligned with the DOE columns
+    
+    # Creates an empty data frame for information collection;
+    # this can be modified based on the information the user is looking for
+    results <- data.frame(matrix(ncol = 5))
+    colnames(results) <-c('DP', 'REP', 'Bad', 'Total', '%')
+    
+    newTest <- getResponse(csvs = csvL, DOE = DOE)
+    test <- rbind(test,newTest)
+    
+    curDOE <- test[,1:nrow(mainFactors)]
+    curDOE <- as.data.frame(t(curDOE))
+    write.table(curDOE, file = paste0(Dir,'DOE_for_turn ',k,'.csv'), sep = ',')
+    
+    par(mfrow = c(2,2))
+    test.mod <- getModel(test)
+    
+    percents <- append(percents, test.mod[1])
+    diff <- abs(percents[[k+1]] - percents[[k]])
+    if(diff > stopVal){
+      c = 0
+    }else{
+      c = c + 1
+    }
+    
+    
+    k = k + 1
+    
+  } else {
+    
+    print(paste('Now executing iteration', k))
+    DOE <- aDOE(test, test.mod[2], mainFactors)
+    drive <- newFolder(StorageLoc, k)
+    
+    Dir = paste0(drive[1],'/')
+    Dir2 = paste0(drive[2],'/')
+    
+    setwd(lbcFolder) 
+    
+    writeScenarios(DOE)
+    
+    files <- getXMLs(Dir)
+    
+    runEx(files, Dir)
+    
+    removeFiles(files)
+    
+    csvL <- list.files(path = Dir, pattern = '*.csv')
+    csvL <- mixedsort(csvL)   # natural sort keeps the files aligned with the DOE columns
+    
+    results <- data.frame(matrix(ncol = 3, nrow = length(csvL)))
+    colnames(results) <- c('All','Total', 'All%')
+    
+    newTest <- getResponse(csvs = csvL, DOE = DOE)
+    test <- rbind(test,newTest)
+    
+    curDOE <- test[,1:nrow(mainFactors)]
+    curDOE <- as.data.frame(t(curDOE))
+    
+    write.table(curDOE, file = paste0(Dir,'DOE_for_turn ',k,'.csv'), sep = ',')
+    
+    
+    par(mfrow = c(2,2))
+    test.mod <- getModel(test)
+    
+    percents <- append(percents, test.mod[1])
+    diff <- abs(percents[[k+1]] - percents[[k]])
+    if(diff > stopVal){
+      c = 0
+    }else{
+      c = c + 1
+    }
+    
+    k = k + 1
+  }
+}  
+
+final.mod <- finalModel(test)
+
+sigFactors(final.mod)
+
+# End program
+print('Program complete')
+pend = Sys.time()
+print(pend - pstart)
+  
+
+
diff --git a/finalModel.R b/finalModel.R
new file mode 100644
index 0000000..511a0ed
--- /dev/null
+++ b/finalModel.R
@@ -0,0 +1,31 @@
+finalModel <- function(data){
+  # Find the final model
+  final.mod <- lm(y~(.)^2, data = data) 
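+  # y ~ (.)^2 fits all factor main effects plus every two-way interaction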
+  
+  
+  favg <- mean(final.mod$residuals)
+  fsigma <- sd(final.mod$residuals)
+  
+  fub <- favg + 2*fsigma
+  flb <- favg - 2*fsigma
+  
+  resid <- sum(final.mod$residuals > fub) + sum(final.mod$residuals < flb)
+  print(paste0('The number of residuals outside of the accepted range is ', resid,' out of ',nrow(data), '.'))
+  print(paste('The program took', k, 'iterations to find the metamodel.'))
+  print(paste0('The percentage of residuals within the acceptable range is ', round((1-resid/nrow(data)),4)*100,'%.'))
+  if(test.mod[1] > z){
+    print(paste('Algorithm stopped without reaching the residual threshold (no improvement for', iterThres, 'consecutive iterations, or the maximum number of iterations was reached).'))
+  }else{
+    print('Algorithm stopped because enough residuals fell within the accepted range.')
+  }
+  
+  # Plot the residuals-vs-fitted and normal Q-Q diagnostics, marking the acceptance bounds
+  par(mfrow = c(2,1))
+  plot(final.mod, which = 1) # residuals vs fitted
+  abline(h = fub, col = 'red', lty = 'dashed')
+  abline(h = flb, col = 'red', lty = 'dashed')
+  # https://stackoverflow.com/questions/2564258/plot-two-graphs-in-same-plot-in-r
+  plot(final.mod, which = 2) # normal Q-Q plot
+  return(final.mod)
+}
\ No newline at end of file
diff --git a/modeling3.R b/modeling3.R
new file mode 100644
index 0000000..7b251d0
--- /dev/null
+++ b/modeling3.R
@@ -0,0 +1,34 @@
+##########################################################################################
+################   Gather the information for the response variable   ####################
+##########################################################################################
+
+getModel <- function(data){
+  print('Conducting regression analysis of the data...')
+  # Fit the metamodel, count the residuals that fall outside the accepted range, and return
+  # that proportion together with the design point that has the largest residual
+  lm.mod <- lm(y~(.)^2, data = data) 
+  print('Model Complete..')
+  setwd(Dir)
+  print('Saving model to folder...')
+  save(lm.mod, file = 'lnmod.RData')
+  # https://stackoverflow.com/questions/14761496/saving-and-loading-a-model-in-r
+  print('Saving data frame to folder..')
+  write.table(data, file = 'model data.csv', sep = ',')
+  # https://datascienceplus.com/exporting-data-from-r/
+
+  avg <- mean(lm.mod$residuals)
+  sigma <- sd(lm.mod$residuals)
+  
+  ub <- avg + 2*sigma 
+  lb <- avg - 2*sigma
+
+  resid <- sum(lm.mod$residuals>ub) + sum(lm.mod$residuals<lb)
+  percent <- resid/nrow(data)
+  print(paste0('The number of residuals outside of the accepted range is ',resid,'.'))
+  print(paste0('The current length of the DOE is ', nrow(data),'.'))
+  print(paste0('The proportion of residuals outside the accepted range is ', round(percent,5), '.'))
+  newCen <- which.max(abs(lm.mod$residuals))
+  print(paste0('Building new DOE using design point ', names(lm.mod$residuals[newCen]),'.'))
+  print('Modeling complete...')
+  plot(lm.mod)  # draw the regression diagnostic plots as a side effect
+  return(c(percent, newCen))
+}
\ No newline at end of file
diff --git a/newadptFolder3.R b/newadptFolder3.R
new file mode 100644
index 0000000..391d6e2
--- /dev/null
+++ b/newadptFolder3.R
@@ -0,0 +1,8 @@
+newFolder <- function(path, k){
+  # Uses the run name 'namefile' set in ADOE4HPC.R to label each iteration's folder
+  dir.create(paste0(path, namefile, k))   # Create folder for storage
+  
+  Dir = paste0(path, namefile, k)
+  Dir2 = paste0(path, namefile, k)
+  
+  return(c(Dir, Dir2))
+}
\ No newline at end of file
diff --git a/response4.R b/response4.R
new file mode 100644
index 0000000..c492b83
--- /dev/null
+++ b/response4.R
@@ -0,0 +1,61 @@
+##########################################################################################
+getResponse <- function(csvs,DOE){
+  print('Generating Response for each experiment...')
+  start_time4 <- Sys.time()
+  
+  for (i in 1:length(csvs)){
+    
+    outputData1 <- fread(paste0(Dir,csvs[i]))
+    
+    # print(paste0('The number of replications imported ', length(unique(outputData1$replication)), '.'))
+    
+    ## Get the balance-on-hand records for DIV-4 units
+    supplyTable <- outputData1 %>% dplyr::filter(propertyName=="balanceOnHand") %>% filter(str_detect(`entityName`, "DIV-4")) %>% collect()
+    
+    
+    # Find minimum supply level reached by unit/commodity, excluding convoys
+    minsupplyTable <- supplyTable %>% filter(!str_detect(`data`,"Convoy")) %>% 
+      group_by(replication, entityName, data) %>% 
+      summarise(minOnHand=min(as.numeric(newValue), na.rm=TRUE)) %>%
+      ungroup()
+    
+    # Minimum on-hand as a fraction of the starting supply level, by replication/unit/commodity (convoys and UNLIMITED entities excluded)
+    minPercentTable<-supplyTable%>%
+      filter(!str_detect(`data`,"Convoy")) %>%
+      filter(!str_detect(`entityName`,"UNLIMITED")) %>%
+      dplyr::select(-eventName, -propertyName, -oldValue) %>%
+      group_by(replication, entityName, data) %>%
+      mutate(minOnHand=min(as.numeric(newValue), na.rm=TRUE), minTime=min(time))  %>%
+      ungroup() %>%
+      filter(time==minTime) %>%
+      mutate(minPercOnHand=minOnHand/as.numeric(newValue)) %>%
+      dplyr::select(data,replication, entityName, minPercOnHand)
+    
+    
+    # Count, per replication, the unit/commodity records whose minimum on-hand fell below 60% of the starting level
+    sub <- minPercentTable %>% group_by(replication) %>% filter(minPercOnHand < .60)  %>% tally()
+    things <- data.frame(matrix(ncol=5))
+    colnames(things) <- c('DP', 'REP', 'Bad', 'Total', '%')
+    for (j in 1:nrow(sub)){
+      things[j,1]<- paste0('DP',i)
+      things[j,2] <- j
+      things[j,3] <- sub[j,2]
+      things[j,4] <- nrow(minPercentTable[minPercentTable$replication == j,])
+      things[j,5] <- things[j,3]/things[j,4]
+    }
+    if (i ==1){
+      results <- things
+    }else{
+      results <- rbind(results, things)
+    }
+    
+  }
+  # Attach the response to the DOE: each design point's factor settings are repeated
+  # numReps times so that every replication row gets its own response value
+  inputdf <- data.frame(t(DOE))
+  repdf <- inputdf[rep(seq_len(nrow(inputdf)),each = numReps),]
+  test <- repdf[1:nrow(repdf),]
+  test$y <- results[,5]
+  print('Data complete for regression analysis')
+  end_time4 <- Sys.time()
+  print(total4 <- end_time4 - start_time4)
+  return(test)
+}
\ No newline at end of file
diff --git a/sigFactors.R b/sigFactors.R
new file mode 100644
index 0000000..af1170d
--- /dev/null
+++ b/sigFactors.R
@@ -0,0 +1,25 @@
+sigFactors <- function(mod){
+  # Display the top n significant factors of the model
+  
+  focus <- data.frame(names(mod$coefficients[!is.na(mod$coefficients)]))
+  # https://stackoverflow.com/questions/16724148/getting-predictor-names-in-r-regression
+  
+  focus$pvalue <- round(coef(summary(mod))[,4],5)
+  # https://stackoverflow.com/questions/23838937/extract-pvalue-from-glm
+  
+  focus$coeff <- round(coef(summary(mod))[,1],5)
+  colnames(focus)[1] <- 'Factors'
+  focus <- focus[!focus$Factors == '(Intercept)',]
+  impFac <- focus[focus$pvalue < alpha,]
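+  # Rank the significant terms by absolute coefficient size (largest estimated effect first)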
+  top10 <- impFac[order(abs(impFac$coeff), decreasing = TRUE),]
+  top10$coeff <- format(top10$coeff, scientific = FALSE)
+  top10 <- top10[complete.cases(top10),]
+  print(head(top10, n))  # head() avoids printing NA rows when fewer than n factors are significant
+  
+  if(nrow(top10) < n){
+    print('More iterations or adjustments to the initial settings may be needed to find more or different significant factors.')
+  }else{
+    print(paste0('These are the top ',n, ' out of ',nrow(top10), ' significant factors.'))
+  }
+  return(datatable(top10))
+}
-- 
GitLab