getwd()
temp <- read.csv("/Volumes/secure/SecureFromPC/MountSinai/BakedMilk2/Lab Meeting 20090818/20080818_BaselineBasoDataWithClinicalGroups.csv")
dim(temp)
attributes(temp)
summary(temp) #summary data for temp

# a histogram of the values of Clinical_Group to examine data distribution
hist(temp$Clinical_Group)

# Exclude the rows where there is no value for Clinical_Group
ExcludeNoClin <- subset(temp, !is.na(Clinical_Group))

# Alternative method that I didn't use
# How to deal with values that are already marked as missing? John Fox says:
# x2 <- na.omit(x)
# produces a copy of the data frame x with all rows that contain missing data removed. The function na.exclude could be used also. For more information on missings, check help: ?na.exclude.

# Keep the rows where the stimulant was a concentration of Milk
MilkStimOnly <- subset(ExcludeNoClin, stim %in% c("Milk1","Milk2","Milk3", "Milk4", "Milk5"))

# Make a vector from the experiment column of MilkStimOnly
Subject <- MilkStimOnly["experiment"]
CG <- MilkStimOnly["Clinical_Group"]
stim <- MilkStimOnly["stim"]
# Quadrant 2 contains events counted as activated basophils (because they are CD63+, CD203c+), Q2 denotes the % of all the events that these form
PercentActBaso <- MilkStimOnly["Q2"]

# Make a new data frame with these vectors only
SimpleDF <- data.frame(Subject, CG, stim, PercentActBaso)

# Shows that SimpleDF$stim is of class "factor" with levels all the original stimulants
attributes(SimpleDF$stim)

# remove the levels from the factor SimpleDF$stim that have no observations with that value (unstained, Anti-IgE etc...). Don't ask me how this works, it's straight from google.
SimpleDF$stim <- SimpleDF$stim[which(SimpleDF$stim %in% names(table(SimpleDF$stim))[table(SimpleDF$stim) >= 1]), drop=TRUE]

# Check that SimpleDF$stim is of class "factor" with levels now just Milk 1-5
attributes(SimpleDF$stim)

# plotting per anova example in Faraway's book p182
plot (SimpleDF$Q2 ~ SimpleDF$Clinical_Group + SimpleDF$stim, data=SimpleDF)

# these two lines look for interaction between the variables Clinical Group and stim ... I think that evidence of no interaction would be several parallel lines ... instead, mine converge ... let's push on anyway
interaction.plot(SimpleDF$stim,SimpleDF$Clinical_Group,SimpleDF$Q2)
interaction.plot(SimpleDF$Clinical_Group,SimpleDF$stim,SimpleDF$Q2)

# fit the model, run the anova, again per Faraway
g <- lm(Q2 ~ Clinical_Group*stim, SimpleDF)
anova(g)

# The main effect "stim" is significant, but "Clinical_Group" is not. Interaction is also not significant

# Plotting the residuals shows a cone-shaped distribution. Maybe the outcome should be log-transformed since milk dilutions were 10-fold each time?
qqnorm(g$res)
plot(g$fitted,g$res,xlab="Fitted",ylab="Residuals")

g <- lm(log(Q2) ~ Clinical_Group*stim, SimpleDF)
# Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : 
  NA/NaN/Inf in foreign function call (arg 4)
# Hmmm .. why the error
# Oh ... it's because I'm taking logs of some zero values ... make them NA (so they will be ignored)

SimpleDF_Log <- SimpleDF
SimpleDF_Log$Q2[SimpleDF_Log$Q2 == 0] <- NA
# re-make the model
h <- lm(log(Q2) ~ Clinical_Group*stim, SimpleDF_Log)

plot(h$fitted,h$res,xlab="Fitted",ylab="Residuals",main="Log response")
# After log-transforming, plotting the residuals shows a more random cloud-shaped distribution. 
qqnorm(h$res)
# and the qq is more linear ... I think that's right

# now re-do analysis with new model
anova(h)
# Still, the main effect "stim" is significant, but "Clinical_Group" is not. Interaction is also not significant

# Maybe there aren't enough subjects in each group ... let's combine the people who tolerated ANY heated milk

PooledCG <- SimpleDF_Log
PooledCG$Clinical_Group[PooledCG$Clinical_Group %in% c(3, 4, 5)] <- 2
# Check that Clinical_Group is numeric and not a factor
class(PooledCG$Clinical_Group)

# make a new model
g <- lm(log(Q2) ~ Clinical_Group*stim, PooledCG)

# plot the residuals
plot(g$fitted,g$res,xlab="Fitted",ylab="Residuals",main="Log response")

# check the QQ
qqnorm(g$res)
# looks good

# re-do analysis
anova(g)
# still not significant between clincial groups

# "slippery slope" question ... bigger difference at any point along dose-response curve?
SimpleDF_Log <- SimpleDF
SimpleDF_Log$Q2[SimpleDF_Log$Q2 == 0] <- NA
interaction.plot(SimpleDF$stim,SimpleDF$Clinical_Group,SimpleDF$Q2)
# interaction.plot calls the mean function, which does not allow NA values. Therefore this does not plot any clinical groups with NA values
interaction.plot(SimpleDF_Log$stim,SimpleDF_Log$Clinical_Group,log(SimpleDF_Log$Q2))

# change the stims back to 0 where where % activated basos is >5% for any milk concentration or anti-IgE - these are "valid"/"responders". I established which these were by hand inspection - BM008, BM021 and BM033 are included; BM020, BM030 and BM043 are excluded.
PooledCG$Q2[39] <- 0 #BM008
PooledCG$Q2[93] <- 0 #BM021
PooledCG$Q2[148] <- 0 #BM033
PooledCG$Q2[149] <- 0 #BM033

# exclude the rows for the subjects where % activated basos is <5% for all the milks and anti-IgE
PooledCGResponders <- subset(PooledCG, (experiment != "BM020") & (experiment != "BM030") & (experiment != "BM043"))

dim(PooledCG)
dim(PooledCGResponders) #should be 15 fewer rows

# find ranges % activated basos. If these vectors contained NAs we would need to exclude them for range and median to work by including na.rm=TRUE as an argument. It is redundant below, because I know there are no NAs, but I left it in to remember this point.
range(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk1"], na.rm=TRUE)
range(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk5"], na.rm=TRUE)
range(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 1], na.rm=TRUE)
range(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 2], na.rm=TRUE)

# find medians % activated basos - need to exclude NAs (which came from 0's) for this function to work
median(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk1"], na.rm=TRUE)
median(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk5"], na.rm=TRUE)
median(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 1], na.rm=TRUE)
median(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 2], na.rm=TRUE)

# put the 0s back to NA so we can take logs
PooledCGResponders$Q2[PooledCGResponders$Q2 == 0] <- NA
# make a new model
g <- lm(log(Q2) ~ Clinical_Group*stim, PooledCGResponders)

# plot the residuals
plot(g$fitted,g$res,xlab="Fitted",ylab="Residuals",main="Log response")

# check the QQ
qqnorm(g$res)
# looks good

# re-do analysis
anova(g)
# still not significant between clincial groups

Milk1Only <- subset(PooledCGResponders, stim == "Milk1")
# should have 32 rows
dim(Milk1Only)

# Wilcoxon with one-tailed test for difference in milk activation between tolerant and reactive subj. Chose Wilcoxon (non-parametric) over Student's t-test (parametric) because the HM-reactive group has only 6 subjects.
wilcox.test(Milk1Only$Q2 ~ Milk1Only$Clinical_Group, alternative = "g")