}
# Two-sample t-test of y by group.
#
# data:  data.frame with a numeric `y` column and a two-level `group` column
# alpha: significance level; the confidence interval of the returned test
#        uses conf.level = 1 - alpha
# Returns: the "htest" object from t.test().
#
# NOTE: the original passed `alpha=alpha`, which t.test() silently ignores
# through its `...` argument — conf.level is the supported way to set this.
analyze_data <- function(data, alpha = 0.05) {
  t.test(y ~ group, data = data, conf.level = 1 - alpha)
}
# NOTE(review): console transcript — several lines below error deliberately
# (teaching demos). `sample_data` and `X` are defined elsewhere in the session.
df1 <- sample_data(X, 1, 2)
analyze_data(df1)
# sd = 0 produces 20 copies of the mean (all zeros here)
rnorm(20, 0, 0)
# typing a function name without () prints its source
sample
# %>% requires magrittr (or dplyr) to be attached
colors() %>% substr(1, 3)
colors()
# help lookup
?rpois()
# demo: a global `iris` masks the built-in dataset until removed
iris
iris = 2
iris
rm('iris')
iris
# clear out the global environment
rm(list=ls())
#for prettier printing of some values
options(digits=4)
# these will be set for the current graphics device and will be reset to default values
# if we close out the plot window
par(pch=16, las=1, col='gray30')
# simple regression: sepal length predicted by sepal width (all species pooled)
m1 = lm(Sepal.Length ~ Sepal.Width, data=iris)
summary(m1)
# residual standard error squared -> residual variance
1.44^2
# demo: undefined object -> error
john
# demo: bare column name is not visible without attach() or $
Sepal.Length
iris$Sepal.Length
# attach() puts iris columns on the search path (generally discouraged;
# kept here because the rest of the transcript relies on it)
attach(iris)
Sepal.Length
# assigning creates a GLOBAL that masks the attached column...
Sepal.Length = 40
# ...and removing the global unmasks the attached column again
rm('Sepal.Length')
summary(m1)
plot(Sepal.Width, Sepal.Length)
abline(m1, lwd=3, col='orangered')
# one plotting color per species, indexed by the Species factor
colors = c('purple4','orange', 'dodgerblue3')
iris_colors = colors[Species]
iris_colors
cbind(Species, iris_colors)
plot(Sepal.Width, Sepal.Length, col=iris_colors)
text(c(4,2.5,3.5), c(6, 5.3, 7), unique(Species), col=colors, cex=1.5)
# residuals vs fitted, colored by species
plot(predict(m1), resid(m1), col=iris_colors, pch=19)
abline(h=0)
# built-in lm diagnostic plots in a 2x2 grid
par(mfrow=c(2,2))
plot(m1)
par(mfrow=c(2,2))
plot(m1, col=iris_colors)
plot(predict(m1), resid(m1), col=iris_colors, pch=19)
abline(h=0)
par(mfrow=c(1,1))
plot(predict(m1), resid(m1), col=iris_colors, pch=19)
abline(h=0)
# click points on the open plot to label them (interactive sessions only)
identify(predict(m1), resid(m1))
plot(predict(m1), resid(m1), col=iris_colors, pch=19)
abline(h=0)
identify(predict(m1), resid(m1))
# .Last.value holds the result of the previous top-level expression
.Last.value
coef(m1)
# design matrix: intercept column + Sepal.Width column
model.matrix(m1)
image(model.matrix(m1))
image(model.matrix(m1), col=gray.colors(100))
image(t(model.matrix(m1)), col=gray.colors(100))
plot(Sepal.Width, Sepal.Length)
abline(m1, lwd=3, col='orangered')
# predictions at new sepal widths
predict(m1, newdata = data.frame(Sepal.Width=c(2.4, 3.2)))
yhat = predict(m1, newdata = data.frame(Sepal.Width=c(2.4, 3.2)))
test_widths = c(2.4, 3.2)
test_widths = c(2.4, 3.2)
yhat = predict(m1, newdata = data.frame(Sepal.Width=test_widths))
# overlay the predicted points (pch/cex tweaked across retries)
points(test_widths, yhat, pch=1, col='orange', lwd=2)
points(test_widths, yhat, pch=1, col='orange', lwd=2)
points(test_widths, yhat, pch=1, col='orange', lwd=2, cex=2)
points(test_widths, yhat, pch=3, col='orange', lwd=2, cex=2)
seq(2, 4, by=0.2)
plot(Sepal.Width, Sepal.Length)
# draw the line of best fit!
abline(m1, lwd=2, col='dodgerblue3')
# copy of iris augmented with the fitted values from m1
iris_with_res = iris
iris_with_res$Sepal.Length_hat = fitted(m1)
View(iris_with_res)
with(iris_with_res,
plot(Sepal.Length_hat, Sepal.Length, col=iris_colors))
unique(Species)
# fit a separate regression per species; by() returns a list-like object
results = by(iris, Species, function(x) lm(Sepal.Length ~ Sepal.Width, data=x))
lapply(results, summary)
# NOTE(review): empty backticks — this line does not parse; transcript typo
results[``]
results[1]
# demo: A is undefined -> error
list(A)
list(a=2)
class(results)
plot(Sepal.Width, Sepal.Length, col=iris_colors)
# one fitted line per species
for(ii in 1:3) {
abline(results[[ii]], col=colors[ii], lwd=2, lty=2)
}
plot(Sepal.Width, Sepal.Length, col=iris_colors)
for(ii in 1:3) {
abline(results[[ii]], col=colors[ii], lwd=2, lty=2)
}
summary(results[[1]])$r.squared
# do this a bunch of times
rsqs = sapply(results, function(x) summary(x)$r.squared)
rsqs = round(rsqs,2)
# or maybe the slopes are of interest
coef(results[[1]])[2]
slopes = sapply(results, function(x) coef(x)[2])
slopes = round(slopes, 2)
text(c(4,2.5,3.5), c(6, 5.3, 7.3), paste(unique(Species), ':', slopes), col=colors, cex=1.5)
pairs(iris, col=iris_colors)
# partial correlations; require() returns FALSE (no error) if ppcor is missing
require(ppcor)
cor.test(Petal.Length, Petal.Width)
# NOTE(review): cbind() on a factor coerces Species to its integer codes
cor(cbind(Petal.Length, Petal.Width, Species))
# new command! we're subsetting the iris data set,
# taking everything EXCEPT Species variable (the minus sign)
cor_mat = cor(subset(iris, select=-Species))
# now get partial correlations
pcor_mat = pcor(subset(iris, select=-Species))
# purple -> white -> gold palette generator for the heatmaps
hm_colors = colorRampPalette(c('purple4', 'white', 'goldenrod'))
yi = xi = 1:5
# new command for getting and setting the diagonal of a matrix
diag(cor_mat) = 0
# NOTE(review): c(1:2) is just 1:2 here, i.e. a 1x2 plot grid
par(mfrow=c(1:2))
# heatmap of the correlation matrix; columns flipped so row 1 is on top
image(t(cor_mat)[,5:1], col=hm_colors(255),
axes=F, x=1:5, y=1:5, asp=1, xlab='', ylab='', zlim=c(-1,1))
#       ?graphics::image
text(xi, rev(yi), c('SL', 'SW', 'PL', 'PW', 'Sp'))
diag(pcor_mat$estimate) = 0
image(t(pcor_mat$estimate)[,5:1], col=hm_colors(255),
axes=F, x=xi, y=yi, asp=1, xlab='', ylab='', zlim=c(-1,1))
text(xi, rev(yi), c('SL', 'SW', 'PL', 'PW', 'Sp'))
pcor_mat = pcor(subset(iris, select=-Species))
# demo: pcor() cannot handle the factor column in iris
pcor(iris)
# coerce Species to numeric so pcor/cor accept the full frame
iris_with_res$Species = as.numeric(iris_with_res$Species)
pcor(iris)
pcor(iris_with_res)
# NOTE(review): this mutates the global copy of the iris dataset
iris$Species = as.numeric(iris_with_res$Species)
pcor(iris_with_res)
cor_mat = cor(iris)
yi = xi = 1:5
# new command for getting and setting the diagonal of a matrix
diag(cor_mat) = 0
par(mfrow=c(1:2))
image(t(cor_mat)[,5:1], col=hm_colors(255),
axes=F, x=1:5, y=1:5, asp=1, xlab='', ylab='', zlim=c(-1,1))
#       ?graphics::image
text(xi, rev(yi), c('SL', 'SW', 'PL', 'PW', 'Sp'))
diag(pcor_mat$estimate) = 0
image(t(pcor_mat$estimate)[,5:1], col=hm_colors(255),
axes=F, x=xi, y=yi, asp=1, xlab='', ylab='', zlim=c(-1,1))
text(xi, rev(yi), c('SL', 'SW', 'PL', 'PW', 'Sp'))
# NOTE(review): cor(), not pcor() — likely a slip, corrected on the next line
pcor_mat = cor(cbind(Petal.Length, Petal.Width, Species))
pcor_mat = pcor(iris_with_res)
yi = xi = 1:5
# new command for getting and setting the diagonal of a matrix
diag(cor_mat) = 0
par(mfrow=c(1:2))
image(t(cor_mat)[,5:1], col=hm_colors(255),
axes=F, x=1:5, y=1:5, asp=1, xlab='', ylab='', zlim=c(-1,1))
#       ?graphics::image
text(xi, rev(yi), c('SL', 'SW', 'PL', 'PW', 'Sp'))
diag(pcor_mat$estimate) = 0
image(t(pcor_mat$estimate)[,5:1], col=hm_colors(255),
axes=F, x=xi, y=yi, asp=1, xlab='', ylab='', zlim=c(-1,1))
text(xi, rev(yi), c('SL', 'SW', 'PL', 'PW', 'Sp'))
pcor(iris[,1:3])
# demo: iris is a data set, not a function -> error
iris()
attach(iris)
source('~/Untitled.R')
# detach repeatedly until iris is off the search path (extra calls error)
detach(iris)
detach(iris)
detach(iris)
attach(iris)
# dummy-coded regression: Petal.Width by Species (first level as baseline)
lm(Petal.Width ~ Species)
lm(Petal.Width ~ Species) %>% summary
m1 <- lm(Petal.Width ~ Species)
summary(m1)
# cell-means coding: drop the intercept to get one mean per species
m0 <- lm(Petal.Width ~ Species - 1)
summary(m0)
?add1(m1, )
# demo: swiss is a data set, not a function -> error
swiss()
swiss
# add1 wants a scope formula, not a bare variable name
add1(m1, Sepal.Width)
add1(m1, ~ Sepal.Width)
# full model: all other iris variables as predictors
mAll <- lm(Petal.Width ~ ., data=iris)
summary(mAll)
# candidate terms: squared sepal length plus all two-way interactions
add1(mAll, ~ I(Sepal.Length^2) + .^2)
summary(add1(mAll, ~ I(Sepal.Length^2) + .^2))
add1(mAll, ~ I(Sepal.Length^2) + .^2)
summary(mAll)
add1(mAll, ~ I(Sepal.Length^2) + .^2, test='F')
drop1(mAll, ~ I(Sepal.Length^2) + .^2, test='F')
drop1(mAll, test='F')
add1(m0, ~ Sepal.Length)
add1(m0, ~ . + Sepal.Length)
add1(m0, ~ . + Sepal.Length, test='F')
summary(lm(Petal.Width ~ Species + Sepal.Length -1))
# the F statistic is the square of the t statistic for a 1-df term
4.730 ^ 2
add1(m1, ~ Sepal.Width, test='F')
add1(m1, ~ Sepal.Width + . , test='F')
summary(mAll)
summary(m0)
summary(mAll)
names(swiss)
?swiss
# interaction vs additive vs numeric-coded species
summary(lm(Sepal.Width ~ Sepal.Length))
summary(lm(Sepal.Width ~ Sepal.Length*Species))
summary(lm(Sepal.Width ~ Sepal.Length + Species))
summary(lm(Sepal.Width ~ Sepal.Length + as.numeric(Species)))
# releveling the factor changes which species is the baseline
my_iris = iris
my_iris
my_iris$Species = factor(my_iris$Species, levels=c('virginica', 'setosa', 'versicolor'))
lm(Sepal.Width ~ Species, data=my_iris)
summary(lm(Sepal.Width ~ Species, data=my_iris))
summary(lm(Sepal.Width ~ Species -1, data=my_iris))
# model comparison: nested F-tests and BIC
anova(m0, m1)
anova(m0, mAll)
anova(mAll, m0)
BIC(mAll, m0)
anova(mAll, m0)
summary(lm(Sepal.Width ~ Species, data=my_iris))
iris$Species
# browsing help for clustering / scaling / PCA
?kmeans
?mds
??mds
?princomp
rm(list=ls())
# clustering
# Today we'll focus more on graphing and exploring, less on technical details
par(col='gray30', las=1, pch=19)
options(digits=4)
colors = c('purple4','orange', 'dodgerblue3')
iris_colors = colors[as.numeric(iris$Species)]
# k-means
names(iris)
# give it a matrix, cluster on the rows
# centers is the pre-specified number of centers
# nstart tells the algorithm how many times to REPEAT the entire clustering process (especially helpful with larger # of clusters)
iris.km3 = kmeans(iris[,1:4], centers=3, nstart=20)
# profile plot of the three cluster centers across the four measurements
plot(iris.km3$centers[1,], type='o', lwd=2, ylim=c(0, 10))
lines(iris.km3$centers[2,], type='o', lwd=2)
lines(iris.km3$centers[3,], type='o', lwd=2)
# jittered confusion plot: assigned cluster vs actual species
plot(jitter(iris.km3$cluster), jitter(as.numeric(iris$Species)), xlab='Predicted Cluster', ylab='Actual Cluster', col=iris_colors)
text(2.5, 2.5, "Note that the cluster numbers\ndon't have to line up with the species numbers\nWe're just looking for groupings")
# --- McGurk power-analysis data prep (project-specific helpers and paths) ---
source('~/Dropbox/scripts/basic_helper.R')
population = read.csv(file='~/Dropbox/MultisensoryIntegration/Papers/InferenceInMcGurk/data/us_mcg_only.csv', row.names=NULL)
# keep only stimulus columns with no missing responses
# (column_apply / not_NA are basic_helper.R functions — semantics presumed)
keep_stim <- population %>% column_apply(not_NA) %>% column_apply(all) %>% which
population <- population[,keep_stim]
# abbreviate stimulus file names to 5-character labels
stimuli <- names(population) %>%
str_replace_all('_', '') %>%
str_replace_all('.avi', '') %>% abbreviate(5)
# long format: one row per subject x stimulus fusion rate
fusion_df <- data.frame(sid=rep(1:165, ncol(population)), mcg=population %>% unlist, stimulus=rep(stimuli, each=nrow(population)))
mean_fusions <- fusion_df %>% do_aggregate(mcg ~ sid, mean)
# Generate a simulated fusion proportion: clamp the predicted rate `mu` into
# MCG_RANGE, draw one binomial count out of n_trials at that rate, and return
# the count as a proportion.
#
# mu:        predicted fusion probability (may fall outside MCG_RANGE)
# n_trials:  number of binomial trials per subject/stimulus
# MCG_RANGE: length-2 numeric c(lo, hi) to clamp mu into
#
# NOTE: the original `range(mu) <- MCG_RANGE` is not valid R (there is no
# `range<-` replacement function); pmin/pmax implements the intended clamp.
gen_samples <- function(mu, n_trials, MCG_RANGE) {
  mu <- pmin(pmax(mu, MCG_RANGE[1]), MCG_RANGE[2])
  # divide_by(n_trials) in the original is just division
  rbinom(1, n_trials, mu) / n_trials
}
# Look up the fusion-rate rows for subject id(s) `idx` at stimulus label(s)
# `stim` in the global `fusion_df`. With multiple ids it recurses per id and
# row-binds the results. Relies on project helpers rbind_list / which.equal
# (from basic_helper.R) and the magrittr pipe.
get_subject_fusion <- function(idx, stim) {
if(length(idx) > 1) {
return (lapply(idx, get_subject_fusion, stim=stim) %>% rbind_list)
}
# because stim could be the same stim repeated
# use sapply instead of just %in% or other boolean selector
sbj_f <- fusion_df[fusion_df$sid == idx,]
# which.equal presumably returns the row index of each matching stimulus,
# preserving duplicates and the order of `stim` — confirm in basic_helper.R
sbj_f[stim  %>% sapply(which.equal,sbj_f$stimulus),]
}
# Simulate one unpaired (between-subjects) McGurk study:
#   - bootstrap n_stimuli stimuli and N subjects per group (with replacement)
#     from the global fusion_df
#   - group 2's underlying fusion rates get `delta` added, clipped to MCG_RANGE
#   - binomial responses generated with n_trials per subject x stimulus
# Returns a long data.frame with columns sid, stimulus, mcg, group.
# Uses project helpers: %&% (string concat), do_aggregate, clip, gen_samples.
design.unpaired <- function(N, n_trials, delta, n_stimuli, MCG_RANGE=c(0.1, 0.9)) {
#print(paste(N, delta, n_stimuli))
# we could also generate parameters from NED model then randomly pick stimulus disparities,
# but bootstrapping from a large N study should be good too
# first pick the stimulus/stimuli
stim <- sample(unique(fusion_df$stimulus), size=n_stimuli, replace=TRUE)
# now pick N subjects -- note that we are sampling WITH replacement, so we need to clean up the subject IDs later on
idx <- sample(unique(fusion_df$sid), size=N, replace = TRUE)
# get the predicted 'mu' for each of subject at the stimulus. To make sure we handle duplicate sid, just iterate
mu1 <- get_subject_fusion(idx, stim)
# fix the sid's in case of duplicates
mu1$sid <- rep('s_' %&% 1:N, each=n_stimuli)
# fix the stim's in case of duplicates
mu1$stimulus <- rep('stim_' %&% 1:n_stimuli, times=N)
# now for each subject+stimulus, generate a binomial sample with n_trials
y1 <- mu1 %>%
do_aggregate(mcg ~ sid + stimulus, gen_samples, n_trials=n_trials, MCG_RANGE=MCG_RANGE)
y1$group <- 'y1'
# for group 2, pick N more subjects and get estimated estimated fusion
idx <- sample(unique(fusion_df$sid), size=N, replace = TRUE)
mu2 <- get_subject_fusion(idx, stim)
# add delta to mu2 for each stimulus
mu2$mcg <- mu2$mcg %>% add(delta) %>% clip(limits=MCG_RANGE)
# fix the sid's in case of duplicates
mu2$sid <- rep('s_' %&% (N+1):(2*N), each=n_stimuli)
#stim id should match mu1
mu2$stimulus <- mu1$stimulus
# now for each subject+stimulus, generate a binomial sample with n_trials
y2 <- mu2 %>%
do_aggregate(mcg ~ sid + stimulus, gen_samples, n_trials=n_trials, MCG_RANGE=MCG_RANGE)
y2$group <- 'y2'
return(data.frame(rbind(y1, y2)))
}
# Run one simulated study from `simulation_parameters` (a list with elements
# design, sample_size, n_trials, delta, n_stimuli) and test the group effect.
# Returns c(p = p-value of the group test, diff = observed group difference
# in mean fusion rate).
simulate_mcgurk_study <- function(simulation_parameters) {
require(magrittr)
# %$% exposes the list elements by name as arguments to design()
data <- simulation_parameters %$% design(sample_size, n_trials, delta, n_stimuli)
# if more than 1 stimulus, use an LME model with stim/sid as random factors
if((data$stimulus %>% unique %>% length) > 1) {
require(lmerTest)
m1 <- lmer(mcg ~ group + (1|sid) + (1|stimulus), data=data, REML=FALSE)
# p-value for the group fixed effect (row 2 of the coefficient table)
p_val <- m1 %>% summary %>% coef %>% {.[2,'Pr(>|t|)']}
} else {
# if 1 stimulus, just do a t-test (this is the most common test, if not the most appropriate)
p_val <- data %$% t.test(mcg ~ group, var.equal=TRUE) %>% extract2('p.value')
}
# average within subject, then across subject within group
group_diff <- data %>% do_aggregate(mcg ~ group + sid, mean) %>%
do_aggregate(mcg ~ group, mean) %$%
diff(mcg)
# could also do model comparison, but people may not appreciate bic_diff as much a p-value
# m0 <- lmer(mcg ~ 1 + (1|sid) + (1|stimulus), data=data, REML=FALSE)
# bic_diff <- BIC(m0, m1) %$% diff(BIC)
return(c('p'=p_val, 'diff'=group_diff))
}
# Draw power curves (one per row of `powers`) against per-group sample sizes
# `Ns` on a log-scaled x axis, colored by `cols`; extra args in ... go to
# lines(). Relies on plot.clean / draw.axis / row_apply_ii from
# basic_plotters.R — semantics presumed from usage; confirm there.
power_plot <- function(Ns, powers, cols, ...) {
# fixed tick positions for the log-scaled N axis
xpos <- c(5, 10, 25, 50, 100, 200, 400)
plot.clean(xpos, 0:1, log='x', xlab='N per group')
draw.axis(1, xpos, labels=xpos)
# y axis labeled as percent power (0-100 in steps of 20)
draw.axis(2, 0:5/5, labels=0:5*20)
# light reference grid
abline(v=xpos, col='gray80', lwd=0.5)
abline(h=0:5/5, col='gray80', lwd=0.5)
powers %>% row_apply_ii(function(pow, ii) {
lines(Ns, pow, col=cols[ii], type='o', pch=16, cex=0.75, ...)
})
box()
}
# --- power simulation, run 1: between subjects, 1 stimulus, 50 sims ---
rm(list=ls())
source('~/Dropbox/scripts/basic_plotters.R')
setwd('~/Dropbox/MultisensoryIntegration/Papers/McGurk_MTurk_Compare/IMRF_Educational_Talk/part2_mturk/')
require(parallel)
require(magrittr)
source('power_analysis.R')
# 2-worker cluster for parReplicate
cl <- makeCluster(2)
# NOTE(review): 05012017 is parsed as the integer 5012017 (leading zero dropped)
set.seed(05012017)
# Build a geometric sequence of length times + 1: x, x*factor, x*factor^2, ...
# With the default factor = 2 this repeatedly doubles the starting value.
seq_double <- function(x, times, factor = 2) {
  steps <- rep(factor, times)
  cumprod(c(x, steps))
}
# quick sanity checks of the doubling sequence
seq_double(6, 6)
seq_double(6, 5)
# simulation settings for run 1
par_list <- list(
n_simulations = 50,
deltas = 1:4/10,
sample_sizes = seq_double(6, 5),
n_trials = 10,
n_stimuli = 1,
design=design.unpaired
)
# ship everything in the global env to the workers
clusterExport(cl, varlist = ls())
# for each effect size delta, simulate across sample sizes; each column of
# res is c(empirical power, magnitude error, sign error) for one N
all_results <- lapply(seq_along(par_list$deltas), function(ii) {
delta <- par_list$deltas[ii]
clusterExport(cl, 'delta', envir = environment())
res <- sapply(par_list$sample_sizes, function(N) {
plist <- list(design=par_list$design, delta=delta, sample_size=N,
n_trials=par_list$n_trials, n_stimuli=par_list$n_stimuli)
results <- parReplicate(par_list$n_simulations, cl, FUN=simulate_mcgurk_study, simulation_parameters=plist)
# proportion of simulations significant at .05 = empirical power
plt05 <- results['p',] < 0.05
percent_diff_sig <- results['diff',plt05]
# how much bigger is the observed delta compared to the 'known' delta, considering only p < 0.05
magnitude_error <- abs(percent_diff_sig / delta) %>% mean
# the true effect is y2 > y1, so if y2-y1 is negative, this is a sign error
sign_error <- mean(percent_diff_sig < 0)
return(c(mean(plt05), magnitude_error, sign_error))
})
print('Done with delta = ' %&% delta)
return(res)
})
# NOTE(review): result not assigned — this renaming is a no-op as written
all_results %>% set_names('delta' %&% par_list$deltas %>% str_replace_all('\\.', '_'))
# extract empirical power (row 1) for each delta; one row per delta after t()
bs_powers <- sapply(all_results, extract, 1, seq_along(par_list$sample_sizes)) %>% t
# smooth each power curve with a cubic polynomial in log(N)
yhat <- bs_powers %>% row_apply(function(y) {
x <- log(par_list$sample_sizes)
lm(y ~ x + I(x^2) + I(x^3)) %>% predict
}) %>% t
# cap the fitted curves at 1 (power cannot exceed 100%)
yhat %<>% row_apply(function(y) {
yi <- which(y > 1)[1]
if(!is.na(yi)) {
y[yi:length(y)] <- 1
}
return(y)
}) %>% t
# median absolute smoothing error
median(abs(bs_powers-yhat))
pow_cols <- colorRampPalette(c('orange', 'purple4'))(length(par_list$deltas))
power_plot(par_list$sample_sizes, yhat, cols = pow_cols, lwd=2)
stopCluster(cl)
# --- power simulation, run 2: same design, 120 sims, figure saved to PDF ---
rm(list=ls())
source('~/Dropbox/scripts/basic_plotters.R')
setwd('~/Dropbox/MultisensoryIntegration/Papers/McGurk_MTurk_Compare/IMRF_Educational_Talk/part2_mturk/')
require(parallel)
require(magrittr)
# get all the functions
source('power_analysis.R')
# create cluster, check size
cl <- makeCluster(2)
set.seed(05012017)
seq_double <- function(x, times, factor=2) cumprod(c(x, rep(factor, times)))
par_list <- list(
n_simulations = 120,
deltas = 1:4/10,
sample_sizes = seq_double(6, 5),
n_trials = 10,
n_stimuli = 1,
design=design.unpaired
)
clusterExport(cl, varlist = ls())
#between subjects
all_results <- lapply(seq_along(par_list$deltas), function(ii) {
delta <- par_list$deltas[ii]
clusterExport(cl, 'delta', envir = environment())
res <- sapply(par_list$sample_sizes, function(N) {
plist <- list(design=par_list$design, delta=delta, sample_size=N,
n_trials=par_list$n_trials, n_stimuli=par_list$n_stimuli)
results <- parReplicate(par_list$n_simulations, cl, FUN=simulate_mcgurk_study, simulation_parameters=plist)
# proportion significant at .05 = empirical power
plt05 <- results['p',] < 0.05
percent_diff_sig <- results['diff',plt05]
# how much bigger is the observed delta compared to the 'known' delta, considering only p < 0.05
magnitude_error <- abs(percent_diff_sig / delta) %>% mean
# the true effect is y2 > y1, so if y2-y1 is negative, this is a sign error
sign_error <- mean(percent_diff_sig < 0)
return(c(mean(plt05), magnitude_error, sign_error))
})
print('Done with delta = ' %&% delta)
return(res)
})
# NOTE(review): result not assigned — this renaming is a no-op as written
all_results %>% set_names('delta' %&% par_list$deltas %>% str_replace_all('\\.', '_'))
#all_results %>% append()
#par(mfrow=c(1,3))
# extract empirical power (row 1) for each delta
bs_powers <- sapply(all_results, extract, 1, seq_along(par_list$sample_sizes)) %>% t
# smooth power curves with a cubic in log(N), then cap at 1
yhat <- bs_powers %>% row_apply(function(y) {
x <- log(par_list$sample_sizes)
lm(y ~ x + I(x^2) + I(x^3)) %>% predict
}) %>% t
yhat %<>% row_apply(function(y) {
yi <- which(y > 1)[1]
if(!is.na(yi)) {
y[yi:length(y)] <- 1
}
return(y)
}) %>% t
median(abs(bs_powers-yhat))
pow_cols <- colorRampPalette(c('orange', 'purple4'))(length(par_list$deltas))
#as_pdf('../figures/sub_figures/power', bg = NULL, w=6, h=2.75, {
#par(mfrow=1:2, mar=c(2, 2, 1, 1))
power_plot(par_list$sample_sizes, yhat, cols = pow_cols, lwd=2)
# 80% power reference line
abline(h=0.80, col='black', lwd=2)
#bs_powers %>% row_apply(function(pw) lines(par_list$sample_sizes, pw))
#power_plot(Ns, ws_powers, cols = pow_cols)
#})
# save the power figure to PDF (as_pdf is a basic_plotters.R helper)
as_pdf('~/Desktop/power1.pdf', bg = NULL, w=6, h=2.75, {
par(mar=c(2, 2, 1, 1))
power_plot(par_list$sample_sizes, yhat, cols = pow_cols, lwd=2)
abline(h=0.80, col='black', lwd=2)
#bs_powers %>% row_apply(function(pw) lines(par_list$sample_sizes, pw))
#power_plot(Ns, ws_powers, cols = pow_cols)
})
# re-save the same figure at half width
as_pdf('~/Desktop/power1.pdf', bg = NULL, w=3, h=2.75, {
par(mar=c(2, 2, 1, 1))
power_plot(par_list$sample_sizes, yhat, cols = pow_cols, lwd=2)
abline(h=0.80, col='black', lwd=2)
#bs_powers %>% row_apply(function(pw) lines(par_list$sample_sizes, pw))
#power_plot(Ns, ws_powers, cols = pow_cols)
})
# --- run 3: same settings but 4 stimuli per simulated study ---
par_list <- list(
n_simulations = 120,
deltas = 1:4/10,
sample_sizes = seq_double(6, 5),
n_trials = 10,
n_stimuli = 4,
design=design.unpaired
)
clusterExport(cl, varlist = ls())
all_results <- lapply(seq_along(par_list$deltas), function(ii) {
delta <- par_list$deltas[ii]
clusterExport(cl, 'delta', envir = environment())
res <- sapply(par_list$sample_sizes, function(N) {
plist <- list(design=par_list$design, delta=delta, sample_size=N,
n_trials=par_list$n_trials, n_stimuli=par_list$n_stimuli)
results <- parReplicate(par_list$n_simulations, cl, FUN=simulate_mcgurk_study, simulation_parameters=plist)
# proportion significant at .05 = empirical power
plt05 <- results['p',] < 0.05
percent_diff_sig <- results['diff',plt05]
# how much bigger is the observed delta compared to the 'known' delta, considering only p < 0.05
magnitude_error <- abs(percent_diff_sig / delta) %>% mean
# the true effect is y2 > y1, so if y2-y1 is negative, this is a sign error
sign_error <- mean(percent_diff_sig < 0)
return(c(mean(plt05), magnitude_error, sign_error))
})
print('Done with delta = ' %&% delta)
return(res)
})
all_results %>% set_names('delta' %&% par_list$deltas %>% str_replace_all('\\.', '_'))
# extract empirical power, smooth with cubic in log(N), cap at 1
bs_powers <- sapply(all_results, extract, 1, seq_along(par_list$sample_sizes)) %>% t
yhat <- bs_powers %>% row_apply(function(y) {
x <- log(par_list$sample_sizes)
lm(y ~ x + I(x^2) + I(x^3)) %>% predict
}) %>% t
yhat %<>% row_apply(function(y) {
yi <- which(y > 1)[1]
if(!is.na(yi)) {
y[yi:length(y)] <- 1
}
return(y)
}) %>% t
median(abs(bs_powers-yhat))
pow_cols <- colorRampPalette(c('orange', 'purple4'))(length(par_list$deltas))
# save the 4-stimulus power figure
as_pdf('~/Desktop/power4stim.pdf', bg = NULL, w=3, h=2.75, {
par(mar=c(2, 2, 1, 1))
power_plot(par_list$sample_sizes, yhat, cols = pow_cols, lwd=2)
abline(h=0.80, col='black', lwd=2)
#bs_powers %>% row_apply(function(pw) lines(par_list$sample_sizes, pw))
#power_plot(Ns, ws_powers, cols = pow_cols)
})
