# Subroutine to perform clustering per the CASANOVA algorithm
# CASANOVA version 1.0
# First Release 03-05-19
Fx_cluster <- function(DIR = getwd(), dataSetName, DetLim_pos, DetLim_neg, PVAL, CASRN=NA, S.vec=NA, dose.groupings, col.names, start.conc.idx){

# Helper functions
asNumeric <- function(x) suppressWarnings(as.numeric(as.character(x)))

# Define function for getting the contrast string
getContrastStr <- function(idx1, idx2, doseLevel)
{
	if (doseLevel == 0)  {
		return (paste(as.character(idx1), "-", as.character(idx2)))
	} else {
		return(paste(as.character(idx1), ",", as.character(doseLevel), " - ", as.character(idx2), ",", as.character(doseLevel), sep=""))
	}
}

# Set constants - alpha levels
ALPHA_INTERACT <- 0.05
ALPHA_REPLICATE <- 0.05
ALPHA_NOISE <- 0.05
todayStr <- format(Sys.Date(), format="%Y-%m-%d")

# Define output file name
outDSName = tools::file_path_sans_ext(dataSetName)
OUT_FILENAME <- paste("ClusterResult_", outDSName, "_", todayStr, ".dat", sep="")

# Needs LOTS of memory to run the large files
options(java.parameters = "-Xmx4g" )


# Open the data file
setwd(DIR)
data.mat = read.delim(dataSetName, header=TRUE, quote="");

data.mat2 = cbind(CASRN, S.vec, DetLim_pos, DetLim_neg, data.mat);

MSE.vec  =rep(NA, nrow(data.mat));
CV.vec   =rep(NA, nrow(data.mat));
range.vec=rep(NA, nrow(data.mat));

# Order chemicals by substance ID, and add column with unique number for each substance ID
pos.order = 1:nrow(data.mat2);
data.mat3 = data.mat2[pos.order, ];

Substance_ID = as.character(data.mat2[,2]);
origData = cbind(data.mat3[,1], Substance_ID, data.mat3);

num.conc <- length(dose.groupings)

# Create the respList and concList
concList <- vector(length = num.conc)
respList <- vector(length = num.conc)
conc.idx <- start.conc.idx
for (i in 1:num.conc) {
	concList[i] <- paste(col.names[1],conc.idx, sep = '')
	respList[i] <- paste(col.names[2],conc.idx, sep = '')
	conc.idx <- conc.idx+1
}

# Get the number of dose levels - based on the max grouping number that was passed in
num.dose.levels = max(dose.groupings)
# Create the list of lists of response columns in each grouping
doseList <- list(length = num.dose.levels)
for (i in 1:num.dose.levels) {
	doseList[i] <- list(respList[which(dose.groupings == i)])
}

# Require certain R libraries
require(lsmeans)

# Throw out lines where sample name is blank
origData <- subset(origData, !(is.na(origData$Sample.Name)))

# Create replicate column, just use original row numbers
origData$replicate <- as.character(rownames(origData))

# Get rid of NA's
if("conc0" %in% colnames(origData))
{
	origData <- subset(origData, !(is.na(origData$conc0)))
}

# Make sure all response columns are numeric
origData <- modifyList(origData, lapply(origData[, respList], asNumeric))

# Set flag if all variables are less than noise threshold limits
origData$allNoise <- 1
for (tmpRsp in respList)  {
	origData$allNoise[which(origData[tmpRsp]> origData[,5] | origData[tmpRsp] < origData[,6])] <- 0
}

# Transpose data to get one row per data value
longData <- reshape(origData[which(origData$allNoise == 0),], varying = respList, v.names = "resp", 
					timevar = "timept", times = respList, new.row.names = NULL, direction = "long",
					drop = concList)

# Drop the NA's
longData <- longData[which(!is.na(longData$resp)),]

# Assign group to doses (resp0-4 are dose 1, resp5-9 are dose 2, 10-14 are dose 3)
longData$doseLevel <- "0"

for (i in 1:num.dose.levels) {
	longData$doseLevel[which(longData$timept %in% doseList[[i]])] <- toString(i)
}

# Compute overall means and dose 1,2,3 means and store in separate data frames. Note these are all per-replicate.
groupMeans <- aggregate(longData[,"resp"], by = list(longData$replicate), FUN = mean)
doseMeans <- list(length = num.dose.levels)
for (i in 1:num.dose.levels) {
	doseMeans[[i]] <- data.frame(aggregate(longData[which(longData$doseLevel == toString(i)),"resp"], 
						by = list(longData[which(longData$doseLevel == toString(i)),"replicate"]),
						FUN = mean))
}

# Change variable name to replicate for each of the above
colnames(groupMeans)[which(colnames(groupMeans) == "Group.1")] = "replicate"
for (i in 1:num.dose.levels) {
	colnames(doseMeans[[i]])[which(colnames(doseMeans[[i]]) == "Group.1")] = "replicate"
}

# Add dose level identifier
for (i in 1:num.dose.levels) {
	doseMeans[[i]]$doseLevel <- toString(i)
}

# Get unique list of sample names from the original data frame
sampleList <- unique(longData[,"Sample.Name"])

# Loop through each sample and analyze separately -- do separate AOV for each sample 
for (i in 1:length(sampleList))
{
	# Get dose level means for just this sample
	replicateList <- unique(longData[which(longData$Sample.Name == sampleList[i]), "replicate"])
	numReps <- length(replicateList)

	MSE = NA;
	CV = NA;
	a.range = NA;

	results <- tryCatch({

		# Run ANOVA with interaction, and get p-values for interaction and replicate terms
		if (numReps > 1)  {
			# Get subset of longitudinal data for just this sample
			tmpData <- longData[which(longData$replicate %in% replicateList),]
			# Get the overall replicate means and the means for each dose level
			tmpGroupMeans <- groupMeans[which(groupMeans$replicate %in% replicateList),]
			tmpDoseMeans <- list()
			for (j in 1:num.dose.levels) {
				tmpDoseMeans[[j]] <- data.frame(doseMeans[[j]][which(doseMeans[[j]]$replicate %in% replicateList),])
			}

			# Run lsmeans to get contrasts
			tmpData.lm <- lm(resp ~ doseLevel * replicate, data = tmpData)
			tmpData.lsmRep <- suppressMessages(lsmeans(tmpData.lm, pairwise ~ replicate, adjust = "none"))
			tmpData.lsmDose <- suppressMessages(lsmeans(tmpData.lm, pairwise ~ replicate * doseLevel, adjust = "none"))
			repSummary <- summary(tmpData.lsmRep)
			doseSummary <- summary(tmpData.lsmDose)
			
			# Run aov to get ANOVA p-values for replicate and interaction term
			fit <- aov(resp ~ doseLevel * replicate, data = tmpData)
			#summary(fit)
			interact <- summary(fit)[[1]]["doseLevel:replicate", "Pr(>F)"]
			replEffect <- summary(fit)[[1]]["replicate", "Pr(>F)"]
			MSE <- mean(fit$residuals^2) 

			a.pre_chem=vector();
			for(a.pre in 1:nrow(tmpData)){
				a.pre_chem[a.pre] = strsplit(rownames(tmpData), ".resp")[[a.pre]][1]
			}
			a.pre_chem=unique(a.pre_chem);

			a.range=vector();
			for(a.pre2 in 1:length(a.pre_chem)){
				pos.tmp.a.pre2 = grep(paste(a.pre_chem[a.pre2], ".resp", sep=""), rownames(tmpData))
				a.range[a.pre2] = max(tmpData$resp[pos.tmp.a.pre2], na.rm=TRUE) - min(tmpData$resp[pos.tmp.a.pre2], na.rm=TRUE)
			}
			CV <- sqrt(MSE)/mean(a.range)
		} else  {
			# With only one replicate, can't run ANOVA
			interact <- 1
			replEffect <- 1
		}

		# If the interaction term is significant, we need to check for differences in each dose group (1,2, and 3)
		if (interact < ALPHA_INTERACT)
		{
			 # Initialize cluster number, these should not end up in the final dataset
			for (j in 1:num.dose.levels) {
				tmpDoseMeans[[j]]$cluster <- 0
			}

			# If the replicate effect was also significant, check for replicate differences before checking for dose differences
			if (replEffect < ALPHA_REPLICATE)
			{
				# Assign cluster numbers based on p-values
				
				# Sort mean data by mean value
				sort.groupMeans <- tmpGroupMeans[order(tmpGroupMeans$x),]
				prevCluster <- 1
	
				# Loop through the replicates in the sorted order			
				lastSameClusterIdx <- 1
				for (j in 1:numReps)
				{
					# Initialize variables before looping through the replicates
					lastRep <- sort.groupMeans[j, "replicate"]
					sameCluster <- FALSE
					k <- j+1
					
					# Check if all of the remaining replicates are significantly different from the current one
					while (k <= numReps)
					{
						checkPValid <- FALSE
						currRep <- sort.groupMeans[k, "replicate"]
						# Need to check for the contrast string in either direction
						contrastStr <- getContrastStr(lastRep, currRep, 0)
						if (!contrastStr %in% repSummary[[2]][,"contrast"]) {
							contrastStr <- getContrastStr(currRep, lastRep, 0)
						}
						# Get the p-value associated with this contrast
						row <- which(repSummary[[2]]$contrast == contrastStr)
						checkP <- repSummary[[2]][row,"p.value"]
						# As soon as one contrast is not significantly different, we can know we won't be incrementing the cluster index so we can exit the loop
						if (checkP > ALPHA_REPLICATE) {
							sameCluster <- TRUE
							lastSameClusterIdx <- k
						}
						k <- k+1
					}
					
					# Assign cluster index back into the data frames
					origData[which(origData$replicate == lastRep), "cluster"] = prevCluster
					for (m in 1:num.dose.levels) {
						tmpDoseMeans[[m]][which(tmpDoseMeans[[m]]$replicate == lastRep), "cluster"] = prevCluster
					}

					# Increment cluster index if they were all significantly different from the current replicate
					if (!sameCluster && lastSameClusterIdx < j+1)  prevCluster <- prevCluster + 1
				}
			}

			# Now look at contrasts for dose averages, starting with the high group
			sort.doseMeans <- list()
			
			# Start with highest dose level and perform 2-sided t-tests using the pooled SD
			for (doseLevel in num.dose.levels:1)
			{
				# Sort by existing cluster index (set above) and then by value
				sort.doseMeans[[doseLevel]] <- tmpDoseMeans[[doseLevel]][order(tmpDoseMeans[[doseLevel]]$cluster, tmpDoseMeans[[doseLevel]]$x),]
				prevCluster <- 1
				lastSameClusterIdx <- 1
				
				# Loop through the replicates in the sorted order			
				for (j in 1:numReps)
				{
					# Check if all of the following replicates are significantly different
					lastRep <- sort.doseMeans[[doseLevel]][j, "replicate"]
					sameCluster <- FALSE
					k <- j+1
					while (k <= numReps)
					{
						# If the new replicate is already in a different cluster, no need to check it
						if (sort.doseMeans[[doseLevel]][k, "cluster"] != sort.doseMeans[[doseLevel]][j, "cluster"] ) {
							break
						}
						
						checkPValid <- FALSE
						currRep <- sort.doseMeans[[doseLevel]][k, "replicate"]
						# Need to check for the contrast string in either direction
						contrastStr <- getContrastStr(lastRep, currRep, doseLevel)
						if (!contrastStr %in% doseSummary[[2]][,"contrast"]) {
							contrastStr <- getContrastStr(currRep, lastRep, doseLevel)
						}
						# Get the p-value associated with this contrast
						row <- which(doseSummary[[2]]$contrast == contrastStr)
						checkP <- doseSummary[[2]][row,"p.value"]
						# As soon as one contrast is not significantly different, we can know we won't be incrementing the cluster index so we can exit the loop
						if (checkP > ALPHA_REPLICATE) {
							sameCluster <- TRUE
							lastSameClusterIdx <- k
						}
						k <- k+1
					}
					# Save current cluster
					origData[which(origData$replicate == lastRep), "cluster"] = prevCluster
					m <- 1
					while (m < doseLevel) {
						tmpDoseMeans[[m]][which(tmpDoseMeans[[m]]$replicate == lastRep), "cluster"] = prevCluster
						m <- m+1
					}

					# Increment cluster if the next group is different
					if (!sameCluster && lastSameClusterIdx < j+1)  prevCluster <- prevCluster + 1
				}
			}
			
		}
		else   # interaction not significant
		{
			if (replEffect < ALPHA_REPLICATE)
			{
				# Ignore the dose level, check for replicate only
				prevCluster <- 1
				
				# Sort by the mean value
				sort.groupMeans <- tmpGroupMeans[order(tmpGroupMeans$x),]
				
				# Loop through the replicates in the sorted order			
				lastSameClusterIdx <- 1
				for (j in 1:numReps)
				{
					# Check if all of the following replicates are significantly different
					lastRep <- sort.groupMeans[j, "replicate"]
					sameCluster <- FALSE
					k <- j+1
					# We will only increment the cluster if all of the following replicates are significantly different from this one
					while (k <= numReps)  # Note this loop gets skipped when j = numReps
					{
						# Get the replicate id
						currRep <- sort.groupMeans[k, "replicate"]
						checkPValid <- FALSE
						# Get the contrast p-value
						contrastStr <- getContrastStr(lastRep, currRep, 0)
						if (!contrastStr %in% repSummary[[2]][,"contrast"]) {
							contrastStr <- getContrastStr(currRep, lastRep, 0)
						}
						# Get the p-value for this comparison
						row <- which(repSummary[[2]]$contrast == contrastStr)
						checkP <- repSummary[[2]][row,"p.value"]

						if (checkP > ALPHA_REPLICATE) {
							sameCluster <- TRUE
							lastSameClusterIdx <- k
						}
						k <- k+1
					}
					# Assign current cluster index
					origData[which(origData$replicate == lastRep), "cluster"] = prevCluster

					# Increment cluster index if this was not the same
					if (!sameCluster && lastSameClusterIdx < j+1)  prevCluster <- prevCluster + 1
				}
			}
			else  # replicate effect not significant
			{
				# Nothing to do here, all the replicates get the sample cluster number
				origData[which(origData$replicate %in% replicateList), "cluster"] = 1
			}
		}
		
	}, error = function(err)  {
		print(paste("Error in sample", i, ":", sampleList[i], err))
	}) #end of trycatch

	# Assign MSE, CV, and range values values
	MSE.vec[which(origData$replicate %in% replicateList)] = MSE;
	CV.vec[which(origData$replicate %in% replicateList)] = CV;
	range.vec[which(origData$replicate %in% replicateList)] = mean(a.range);
	fit=NA;  model.lm=NA;
	rm(fit); rm(model.lm);

	# For each cluster, determine if it is significantly outside of the noise band
	# Loop through the clusters within this sample
	maxCluster <- max(origData[which(origData$replicate %in% replicateList), "cluster"])
	for (clusterIdx in 1:maxCluster) {
		# Get subset of data from this cluster
		clusterReps <- origData[which(origData$replicate %in% replicateList & origData$cluster == clusterIdx), "replicate"]
		# Get subset of longitudinal data for just this cluster
		tmpLongData <- longData[which(longData$replicate %in% clusterReps),]
		# Get the points that are above the noise threshold
		tmpLongData$posResid <- tmpLongData$resp - tmpLongData$DetLim_pos
		tmpLongData$negResid <- tmpLongData$resp - tmpLongData$DetLim_neg  # will be negative is below negative limit
		posResiduals <- tmpLongData[which(tmpLongData$posResid > 0), "posResid"]
		negResiduals <- tmpLongData[which(tmpLongData$negResid < 0), "negResid"]
		# Get the p-value for the t-test of these points being above the noise threshold
		if (length(posResiduals) < 2) {
			pos.ttest.pval <- 1
		} else {
			pos.ttest.pval <- t.test(posResiduals, mu = 0, alternative = "greater")$p.value
		}
		if (length(negResiduals) < 2) {
			neg.ttest.pval <- 1
		} else {
			neg.ttest.pval <- t.test(negResiduals, mu = 0, alternative = "less")$p.value
		}
		# Set the clusterSig to 1 if either of the p-values is less than .05
		origData[which(origData$replicate %in% replicateList & origData$cluster == clusterIdx), "clusterSig"] <- as.integer((min(pos.ttest.pval, neg.ttest.pval) < ALPHA_NOISE))
	}
}

# Go back through the samples and classify the clustering
# CASE2 = All noise, CASE1_CONCL = One cluster, different from noise, CASE1_INCONCL = One cluster inconclusive, CASE3_CONCL = Multiple clusters, CASE3_INCONCL = Multiple Clusters inconclusive
sampleList <- unique(origData$Sample.Name)
for (i in 1:length(sampleList)) {
	# Subset the data
	tmpData <- origData[which(origData$Sample.Name == sampleList[i]),]
	# noiseCluster is 1 if any of the replicates are all noise, otherwise 0
	noiseCluster <- as.integer(nrow(tmpData[which(tmpData$allNoise == 1),]) > 0)
	# numNonNoise is the number of clusters that are significantly outside of the noise band
	numNonNoise <- length(unique(tmpData[which(tmpData$clusterSig == 1), "cluster"]))
	# numClusters is just the overall number of unique clusters, including an all-noise cluster if it exists
	numClusters <- length(unique(tmpData$cluster))
	if (noiseCluster == 1 & numClusters == 1) {
		origData[which(origData$Sample.Name == sampleList[i]), "CODE"] <- "CASE2"
	} else if (noiseCluster == 0 & numClusters == 1 && numNonNoise == 1)  {
		origData[which(origData$Sample.Name == sampleList[i]), "CODE"] <- "CASE1_CONCL"
	} else if (noiseCluster == 0 & numClusters == 1 && numNonNoise == 0)  {
		origData[which(origData$Sample.Name == sampleList[i]), "CODE"] <- "CASE1_INCONCL"
	} else if (noiseCluster + numNonNoise >= 2)  {
		origData[which(origData$Sample.Name == sampleList[i]), "CODE"] <- "CASE3_CONCL"
	} else if (numNonNoise >= 1 & numClusters >= 2)  {
		origData[which(origData$Sample.Name == sampleList[i]), "CODE"] <- "CASE3_CONCL"
	} else {
		origData[which(origData$Sample.Name == sampleList[i]), "CODE"] <- "CASE3_INCONCL"
	}
}
# Don't need the clusterSig column
origData$clusterSig <- NA

# Export the original data with the cluster column added
write.table(cbind(origData[, !(names(origData) %in% c("NA.", "replicate", "clusterSig"))], MSE.vec, range.vec, CV.vec), OUT_FILENAME, row.names = FALSE, sep="\t")
print(paste("Output written to ", OUT_FILENAME))

} #end function



