getwd() temp <- read.csv("/Volumes/secure/SecureFromPC/MountSinai/BakedMilk2/Lab Meeting 20090818/20080818_BaselineBasoDataWithClinicalGroups.csv") dim(temp) attributes(temp) summary(temp) #summary data for temp # a histogram of the values of Clinical_Group to examine data distribution hist(temp$Clinical_Group) # Exclude the rows where there is no value for Clinical_Group ExcludeNoClin <- subset(temp, !is.na(Clinical_Group)) # Alternative method that I didn't use # How to deal with values that are already marked as missing? John Fox says: # x2 <- na.omit(x) # produces a copy of the data frame x with all rows that contain missing data removed. The function na.exclude could be used also. For more information on missings, check help: ?na.exclude. # Keep the rows where the stimulant was a concentration of Milk MilkStimOnly <- subset(ExcludeNoClin, stim %in% c("Milk1","Milk2","Milk3", "Milk4", "Milk5")) # Make a vector from the experiment column of MilkStimOnly Subject <- MilkStimOnly["experiment"] CG <- MilkStimOnly["Clinical_Group"] stim <- MilkStimOnly["stim"] # Quadrant 2 contains events counted as activated basophils (because they are CD63+, CD203c+), Q2 denotes the % of all the events that these form PercentActBaso <- MilkStimOnly["Q2"] # Make a new data frame with these vectors only SimpleDF <- data.frame(Subject, CG, stim, PercentActBaso) # Shows that SimpleDF$stim is of class "factor" with levels all the original stimulants attributes(SimpleDF$stim) # remove the levels from the factor SimpleDF$stim that have no observations with that value (unstained, Anti-IgE etc...). Don't ask me how this works, it's straight from google. SimpleDF$stim <- SimpleDF$stim[which(SimpleDF$stim %in% names(table(SimpleDF$stim))[table(SimpleDF$stim) >= 1]), drop=TRUE] # Check that SimpleDF$stim is of class "factor" with levels now just Milk 1-5 attributes(SimpleDF$stim) # plotting per anova example in Faraway's book p182 plot (SimpleDF$Q2 ~ SimpleDF$Clinical_Group + SimpleDF$stim, data=SimpleDF) # these two lines look for interaction between the variables Clinical Group and stim ... I think that evidence of no interaction would be several parallel lines ... instead, mine converge ... let's push on anyway interaction.plot(SimpleDF$stim,SimpleDF$Clinical_Group,SimpleDF$Q2) interaction.plot(SimpleDF$Clinical_Group,SimpleDF$stim,SimpleDF$Q2) # fit the model, run the anova, again per Faraway g <- lm(Q2 ~ Clinical_Group*stim, SimpleDF) anova(g) # The main effect "stim" is significant, but "Clinical_Group" is not. Interaction is also not significant # Plotting the residuals shows a cone-shaped distribution. Maybe the outcome should be log-transformed since milk dilutions were 10-fold each time? qqnorm(g$res) plot(g$fitted,g$res,xlab="Fitted",ylab="Residuals") g <- lm(log(Q2) ~ Clinical_Group*stim, SimpleDF) # Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) : NA/NaN/Inf in foreign function call (arg 4) # Hmmm .. why the error # Oh ... it's because I'm taking logs of some zero values ... make them NA (so they will be ignored) SimpleDF_Log <- SimpleDF SimpleDF_Log$Q2[SimpleDF_Log$Q2 == 0] <- NA # re-make the model h <- lm(log(Q2) ~ Clinical_Group*stim, SimpleDF_Log) plot(h$fitted,h$res,xlab="Fitted",ylab="Residuals",main="Log response") # After log-transforming, plotting the residuals shows a more random cloud-shaped distribution. qqnorm(h$res) # and the qq is more linear ... I think that's right # now re-do analysis with new model anova(h) # Still, the main effect "stim" is significant, but "Clinical_Group" is not. Interaction is also not significant # Maybe there aren't enough subjects in each group ... let's combine the people who tolerated ANY heated milk PooledCG <- SimpleDF_Log PooledCG$Clinical_Group[PooledCG$Clinical_Group %in% c(3, 4, 5)] <- 2 # Check that Clinical_Group is numeric and not a factor class(PooledCG$Clinical_Group) # make a new model g <- lm(log(Q2) ~ Clinical_Group*stim, PooledCG) # plot the residuals plot(g$fitted,g$res,xlab="Fitted",ylab="Residuals",main="Log response") # check the QQ qqnorm(g$res) # looks good # re-do analysis anova(g) # still not significant between clincial groups # "slippery slope" question ... bigger difference at any point along dose-response curve? SimpleDF_Log <- SimpleDF SimpleDF_Log$Q2[SimpleDF_Log$Q2 == 0] <- NA interaction.plot(SimpleDF$stim,SimpleDF$Clinical_Group,SimpleDF$Q2) # interaction.plot calls the mean function, which does not allow NA values. Therefore this does not plot any clinical groups with NA values interaction.plot(SimpleDF_Log$stim,SimpleDF_Log$Clinical_Group,log(SimpleDF_Log$Q2)) # change the stims back to 0 where where % activated basos is >5% for any milk concentration or anti-IgE - these are "valid"/"responders". I established which these were by hand inspection - BM008, BM021 and BM033 are included; BM020, BM030 and BM043 are excluded. PooledCG$Q2[39] <- 0 #BM008 PooledCG$Q2[93] <- 0 #BM021 PooledCG$Q2[148] <- 0 #BM033 PooledCG$Q2[149] <- 0 #BM033 # exclude the rows for the subjects where % activated basos is <5% for all the milks and anti-IgE PooledCGResponders <- subset(PooledCG, (experiment != "BM020") & (experiment != "BM030") & (experiment != "BM043")) dim(PooledCG) dim(PooledCGResponders) #should be 15 fewer rows # find ranges % activated basos. If these vectors contained NAs we would need to exclude them for range and median to work by including na.rm=TRUE as an argument. It is redundant below, because I know there are no NAs, but I left it in to remember this point. range(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk1"], na.rm=TRUE) range(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk5"], na.rm=TRUE) range(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 1], na.rm=TRUE) range(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 2], na.rm=TRUE) # find medians % activated basos - need to exclude NAs (which came from 0's) for this function to work median(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk1"], na.rm=TRUE) median(PooledCGResponders$Q2[PooledCGResponders$stim == "Milk5"], na.rm=TRUE) median(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 1], na.rm=TRUE) median(PooledCGResponders$Q2[PooledCGResponders$Clinical_Group == 2], na.rm=TRUE) # put the 0s back to NA so we can take logs PooledCGResponders$Q2[PooledCGResponders$Q2 == 0] <- NA # make a new model g <- lm(log(Q2) ~ Clinical_Group*stim, PooledCGResponders) # plot the residuals plot(g$fitted,g$res,xlab="Fitted",ylab="Residuals",main="Log response") # check the QQ qqnorm(g$res) # looks good # re-do analysis anova(g) # still not significant between clincial groups Milk1Only <- subset(PooledCGResponders, stim == "Milk1") # should have 32 rows dim(Milk1Only) # Wilcoxon with one-tailed test for difference in milk activation between tolerant and reactive subj. Chose Wilcoxon (non-parametric) over Student's t-test (parametric) because the HM-reactive group has only 6 subjects. wilcox.test(Milk1Only$Q2 ~ Milk1Only$Clinical_Group, alternative = "g")