Skip to content
Snippets Groups Projects
Commit d8696ea5 authored by Streams, James (CPT)'s avatar Streams, James (CPT)
Browse files

latest version of algorithm

parent 2084ff90
No related branches found
No related tags found
No related merge requests found
##########################################################################################
################## ADOE Code ##########################
##########################################################################################
# Purpose: Aid analyst in analysis by allowing the program to determine the best range
# the factor inputs should be to obtain an objective.
##########################################################################################
# Packages used:
library(XML) # Tools for parsing and generating XML within R
library(stringr) # Common string operations
library(dplyr) # Data manipulation
library(DiceDesign) # Generates a DOEs
library(tidyr) # Easily tidy data with the spread() or gather() function
library(data.table) # Extension of data frame
library(bestglm) # Best subset GLM and regression utilities
library(gtools) # Order string experssions; mixedsort()
library(parallel) # Allows for parallel computing
# library(DT) # Function to print datatable
print('Libraries loaded...')
# Set initial working directory
setwd("/home/jestream/LBCv5/ADOE/")
### Load DoE Generation Functions
# Functions here include loading base case XML, creating experimental design, and generating XML scenarios for exection
source('DOEGen.R') # Loads DoE Generator Function (newDOE) based on 'DiceDesign' NOLH function.
source('scenCreator.R') # Loads function to take design and base XML (makeXMLs) to create required number of XMLs to run
source('gatherxmls.R') # Creates a list of XMLs to process through the simulation
source('runSim.R') # Loads function that runs DoE in parallel.
source('remove.R') # Remove scenarios files to clear room on hard drive
source('response4.R') # Gets the response variable from the data
source('modeling3.R') # generates the model
source('adaptDOEGen2.R') # DOE generator for the adaptive portion of the program
source('newadptFolder3.R') # Creates new folder for storage of each iterations data
source('finalModel.R') # Displays the results and the final plot of the found meta-model
source('sigFactors.R') # Displays the found significant factors from the meta-model
print('Functions loaded...')
# Location of LBC or Simulation to use
lbcFolder <- "/home/jestream/LBCv5/"
# xml files location
setwd("/home/jestream/LBCv5/data/excel/")
# The xml file to parse for analysis
xmlfile = "SCEN7-Phase-3-5NOV2018-90.v2.xml"
# Where to store the data
StorageLoc <- "/home/jestream/LBCv5/data/"
# Name of the files for storage
namefile <- 'finaltest'
# identify the directory where the files are to be located
Dir = paste0(StorageLoc,namefile,"/")
Dir2 = paste0(StorageLoc,namefile,"/")
# the number of replications per design point
numReps = 30
# Sets the initial range of value for the first DOE
p = 0.15
# Sets the range of min amd max values for follow on DOEs
p2 = 0.10
# Starting value for iterations, should remain at 1 unless starting from a different iteration
k = 1
# Stopping condition for adaptive process
z = 0.1 # percent of residuals outside of acceptable range
stopVal <- 0.004 # minimum change needed to continue iteration
iterThres = 2 # number of iterations in a row the model is allowed to iterate wwithout improvement
maxIter = 12
# Values for displaying significant factors
alpha = 0.05 # The minimum p-value a factor must have to be considered significant
n = 10 # The max number of significant factors to return
##########################################################################################
################## Search Parameters in XML files ##########################
##########################################################################################
pstart = Sys.time() # starts the clock to time how long the program takes to complete
print('Parsing XML file...')
# Breaks the xmlFile into sections and saves a temp version for data extraction
xmldata <- xmlTreeParse(xmlfile, useInternal = TRUE)
# Locates the a number of replications value
numReplications = getNodeSet(xmldata, "//LBCAssembly[@numReplications]")
# find all ThresholdReorderLogic Nodes
ReorderNodes=getNodeSet(xmldata, "//ThresholdReorderLogic")
# Locates all the reorder point data (this section can be adjusted to look for other
# areas of interest such as consumption rate or truck capacities)
ReorderValueNode = getNodeSet(xmldata, "//ThresholdReorderLogic//*[@value]")
numNodes = length(ReorderValueNode)
# Create lists to collect information from xmlFile
numNodesName=character()
nodeNum = integer()
reps = character()
name = character()
Company = character()
UnitId = character()
Consumable =character()
value = character()
# this loop will return all required information from the parent nodes in reference
# to the attribute value name
for (i in 1:numNodes){
parent = xmlParent(ReorderValueNode[[i]])
child = xmlChildren(ReorderNodes[[1]])[i]
grandParent = xmlParent(xmlParent(ReorderValueNode[[i]]))
nodeNum[i] = i
name[i] = xmlAttrs(parent) #gets the methodName of the parent node in reference to the attribute value name
reps[i] = xmlAttrs(numReplications[[1]])['numReplications']
# use the name to find the Company Name
providerNode = getNodeSet(grandParent, ".//*[@refId]")
# extract the consumable
Consumable[i]= xmlAttrs(providerNode[[1]])['refId']
# extract the Company Name
UnitId[i]= xmlAttrs(providerNode[[2]])['refId']
# extract the value
valueNode = getNodeSet(parent, ".//*[@value]")
value[i]= xmlAttrs(valueNode[[1]])['value']
}
# Data frame of collected nodes from entire xmlFile
ReorderDf = data.frame(nodeNum, reps, name, UnitId, Consumable, value)
print('Parsing complete...')
##########################################################################################
################## Locating Factors of Interest ##########################
##########################################################################################
# Choose the units or factors of interest through the use of dplyr filters
# This can be changed to look for any unit or factor based on the information available
# from the xml data collection
Div4 <- ReorderDf %>% select(nodeNum, reps, name, UnitId, Consumable, value) %>%
filter(str_detect(UnitId, "DIV-4")) %>% filter(!str_detect(UnitId, paste(c("BSB", "FSC", "DISTRO", "ALLIED", "SUST", "EN-BDE",
"HHC", "HHB", "HQ", "BEB","MEB","MI-BN"), collapse = "|"))) %>% filter(!str_detect(Consumable, paste(c("CARGO", "WATER"),collapse = "|")))
# Enusre that MASS is not running at the same time as dplyr or else it will mask the select function in dplyr
mainFactors <- Div4 %>% filter(str_detect(Consumable, "AMMO")) # replace with your choosen data set
mainFactors$value <- as.numeric(levels(mainFactors$value))[mainFactors$value] # values to vary to create a range
# Code found at
# https://stackoverflow.com/questions/3418128/how-to-convert-a-factor-to-integer-numeric-without-loss-of-information
# Now set the range of min and max values
# add those values to the data frame
mainFactors['min'] <- (mainFactors$value - mainFactors$value*p)
mainFactors['max'] <- (mainFactors$value*(1+p))
# From DOEGen.R
ammoDOE <- newDOE(mainFactors)
# Creates folder for initial storage of data, it is based on the Dir set at the beginning
dir.create(Dir)
writeScenarios(ammoDOE)
files <- getXMLs(Dir)
##########################################################################################
# Runs each scenario through LBC, this takes awhile since it has to shut down and restart LBC each time
setwd(lbcFolder)
runEx(files, Dir)
removeFiles(files)
##########################################################################################
################ Gather the information for the response varialbe ####################
##########################################################################################
# Finds all the csv files and then puts the names in a list
csvL <- list.files(path = Dir, pattern = '*.csv')
csvL <- mixedsort(csvL)
# Creates an empty data frame for information collection,
# this can be modified based on what desired information the user is looking for
results <- data.frame(matrix(ncol = 5))
colnames(results) <-c('DP', 'REP', 'Bad', 'Total', '%')
test <- getResponse(csvs = csvL, DOE = ammoDOE)
write.table(test, file = paste0(Dir,'test.csv'), sep = ',')
##########################################################################################
# need to include code that saves each linear model and test data to the folder holding the data
par(mfrow = c(2,2))
test.mod <- getModel(test)
# Save the current DOE
curDOE <- test[,1:nrow(mainFactors)]
curDOE <- as.data.frame(t(curDOE))
write.table(curDOE, file = paste0(Dir,'DOE for initial .csv'), sep = ',')
##########################################################################################
# Storing percent values for additional stopping criteria
percents <- list()
percents <- append(percents, test.mod[1])
c = 0
##########################################################################################
################ Adaptive Portion, as long as it meets conditions ####################
##########################################################################################
while (test.mod[1] > z && c < iterThres && k < maxIter){
if(k == 1){
print(paste('Now executing iteration',k))
DOE <- aDOE(test, test.mod[2], mainFactors)
drive <- newFolder(StorageLoc, k)
Dir = paste0(drive[1],'/')
Dir2 = paste0(drive[2],'/')
setwd(lbcFolder)
writeScenarios(DOE)
files <- getXMLs(Dir)
runEx(files, Dir)
removeFiles(files)
# Finds all the csv files and then puts the names in a list
csvL <- list.files(path = Dir, pattern = '*.csv')
# Creates an empty data frame for information collection,
# this can be modified based on what desired information the user is looking for
results <- data.frame(matrix(ncol = 5))
colnames(results) <-c('DP', 'REP', 'Bad', 'Total', '%')
newTest <- getResponse(csvs = csvL, DOE = DOE)
test <- rbind(test,newTest)
curDOE <- test[,1:nrow(mainFactors)]
curDOE <- as.data.frame(t(curDOE))
write.table(curDOE, file = paste0(Dir,'DOE_for_turn ',k,'.csv'), sep = ',')
par(mfrow = c(2,2))
test.mod <- getModel(test)
percents <- append(percents, test.mod[1])
diff <- abs(percents[[k+1]] - percents[[k]])
if(diff > stopVal){
c = 0
}else{
c = c + 1
}
k = k + 1
} else {
print(paste('Now executing iteration', k))
DOE <- aDOE(test, test.mod[2], mainFactors)
drive <- newFolder(StorageLoc, k)
Dir = paste0(drive[1],'/')
Dir2 = paste0(drive[2],'/')
setwd(lbcFolder)
writeScenarios(DOE)
files <- getXMLs(Dir)
runEx(files, Dir)
removeFiles(files)
csvL <- list.files(path = Dir, pattern = '*.csv')
results <- data.frame(matrix(ncol = 3, nrow = length(csvL)))
colnames(results) <- c('All','Total', 'All%')
newTest <- getResponse(csvs = csvL, DOE = DOE)
test <- rbind(test,newTest)
curDOE <- test[,1:nrow(mainFactors)]
curDOE <- as.data.frame(t(curDOE))
write.table(curDOE, file = paste0(Dir,'DOE_for_turn ',k,'.csv'), sep = ',')
par(mfrow = c(2,2))
test.mod <- getModel(test)
percents <- append(percents, test.mod[1])
diff <- abs(percents[[k+1]] - percents[[k]])
if(diff > stopVal){
c = 0
}else{
c = c + 1
}
k = k + 1
}
}
final.mod <- finalModel(test)
sigFactors(final.mod)
# End program
print('Program complete')
pend = Sys.time()
print(pend - pstart)
finalModel <- function(data){
# Find the final model
final.mod <- lm(y~(.)^2, data = data)
favg <- mean(final.mod$residuals)
fsigma <- sd(final.mod$residuals)
fub <- favg + 2*fsigma
flb <- favg - 2*fsigma
resid <- sum(final.mod$residuals>fub) + sum(sum(final.mod$residuals<flb))
print(paste0('The number of residuals outside of the accepted range is ', resid,' out of ',nrow(data), '.'))
print(paste('The program took', k, 'iterations to find the metamodel.'))
print(paste0('The percentage of residuals within the acceptable is ', round((1-resid/nrow(data)),4)*100,'%.'))
if(test.mod[1] > z){
print(paste0('Algorithm stopped due to failing to improve by the specified threshold for 2 iterations'))
}else{
print('Algorithm stopped due to reaching enough residuals within an accepted range.')
}
# plot the Residuals and QQ PLot
par(mfrow = c(2,1))
plot(final.mod, which = c(2,1)) # prints the residual vs fitted graph
abline(plot(final.mod, which = c(1,1)),h = fub, col = 'red', lty = 'dashed')
abline(h = flb, col = 'red', lty = 'dashed')
par(new = T)
# https://stackoverflow.com/questions/2564258/plot-two-graphs-in-same-plot-in-r
plot(final.mod, which = c(2,1))
return(final.mod)
}
\ No newline at end of file
##########################################################################################
################ Gather the information for the response varialbe ####################
##########################################################################################
getModel <- function(data){
print('Conducting regression analysis of the data...')
# This will find the resdiuals that fall within a certain range and return the number of
# residuals that are outside of the range and DP with the highest residual
lm.mod <- lm(y~(.)^2, data = data)
print('Model Complete..')
setwd(Dir)
print('Saving model to folder...')
save(lm.mod, file = 'lnmod.RData')
# https://stackoverflow.com/questions/14761496/saving-and-loading-a-model-in-r
print('Saving data frame to folder..')
write.table(data, file = 'model data.csv', sep = ',')
# https://datascienceplus.com/exporting-data-from-r/
avg <- mean(lm.mod$residuals)
sigma <- sd(lm.mod$residuals)
ub <- avg + 2*sigma
lb <- avg - 2*sigma
resid <- sum(lm.mod$residuals>ub) + sum(lm.mod$residuals<lb)
percent <- resid/nrow(data)
print(paste0('The number of residuals outside of the accepted range is ',resid,'.'))
print(paste0('The current length of the DOE is ', nrow(data),'.'))
print(paste0('The percentage of residuals outside of the accepted range is ', round(percent,5), '.'))
newCen <- which.max(abs(lm.mod$residuals))
print(paste0('Building new DOE using design point ', names(lm.mod$residuals[newCen]),'.'))
print('Modeling complete...')
return(c(percent,newCen, plot(lm.mod)))
}
\ No newline at end of file
newFolder <- function(path, k){
dir.create(paste0(path,filename,k )) # Create folder for storage
Dir = paste0(path,filename,k)
Dir2 = paste0(path,filename,k)
return(c(Dir, Dir2))
}
\ No newline at end of file
##########################################################################################
getResponse <- function(csvs,DOE){
print('Generating Reponse for each experiment...')
start_time4 <- Sys.time()
for (i in 1:length(csvs)){
outputData1 <- fread(paste0(Dir,csvs[i]))
# print(paste0('The number of replications imported ', length(unique(outputData1$replication)), '.'))
## Get Data for Supply Minimum Supply Table, must be reactive value to avoid conflicts
supplyTable <- outputData1 %>% dplyr::filter(propertyName=="balanceOnHand") %>% filter(str_detect(`entityName`, "DIV-4")) %>% collect()
# Find minimum supply level reached by unit/commodity, excluding convoys
minsupplyTable <- supplyTable %>% filter(!str_detect(`data`,"Convoy")) %>%
group_by(replication, entityName, data) %>%
summarise(minOnHand=min(as.numeric(newValue), na.rm=TRUE)) %>%
ungroup()
# Find starting supply level by unit/commodity, excluding convoys
minPercentTable<-supplyTable%>%
filter(!str_detect(`data`,"Convoy")) %>%
filter(!str_detect(`entityName`,"UNLIMITED")) %>%
dplyr::select(-eventName, -propertyName, -oldValue) %>%
group_by(replication, entityName, data) %>%
mutate(minOnHand=min(as.numeric(newValue), na.rm=TRUE), minTime=min(time)) %>%
ungroup() %>%
filter(time==minTime) %>%
mutate(minPercOnHand=minOnHand/as.numeric(newValue)) %>%
dplyr::select(data,replication, entityName, minPercOnHand)
sub <- minPercentTable %>% group_by(replication) %>% filter(minPercOnHand < .60) %>% tally()
things <- data.frame(matrix(ncol=5))
colnames(things) <- c('DP ', 'REP', 'Bad', 'Total', '%')
for (j in 1:nrow(sub)){
things[j,1]<- paste0('DP',i)
things[j,2] <- j
things[j,3] <- sub[j,2]
things[j,4] <- nrow(minPercentTable[minPercentTable$replication == j,])
things[j,5] <- things[j,3]/things[j,4]
}
if (i ==1){
results <- things
}else{
results <- rbind(results, things)
}
}
# Applies response variables to the DOE
inputdf <- data.frame(t(DOE))
repdf <- inputdf[rep(seq_len(nrow(inputdf)),each = numReps),]
test <- repdf[1:nrow(repdf),]
test$y <- results[,5]
print('Data complete for regression analysis')
end_time4 <- Sys.time()
print(total4 <- end_time4 - start_time4)
return(test)
}
\ No newline at end of file
sigFactors <- function(mod){
# Diplay the top 10 significant factors of the model
focus <- data.frame(names(mod$coefficients[!is.na(mod$coefficients)]))
# https://stackoverflow.com/questions/16724148/getting-predictor-names-in-r-regression
focus$pvalue <- round(coef(summary(mod))[,4],5)
# https://stackoverflow.com/questions/23838937/extract-pvalue-from-glm
focus$coeff <- round(coef(summary(mod))[,1],5)
colnames(focus)[1] <- 'Factors'
focus <- focus[!focus$Factors == '(Intercept)',]
impFac <- focus[focus$pvalue < alpha,]
top10 <- impFac[order(abs(impFac$coeff), decreasing = TRUE),]
top10$coeff <- format(top10$coeff, scientific = FALSE)
top10 <- top10[complete.cases(top10),]
print(top10[1:n,])
if(nrow(top10) < n){
print('More iterations or adjustments to inital settings maybe needed to find more or different significant factors')
}else{
print(paste0('These are the top ',n, ' out of ',nrow(top10), ' significant factors.'))
}
return(datatable(top10))
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment