# Load packages.
library(tidyverse)
library(readxl)
library(Matrix)
library(lme4)
library(lmerTest)
# Use the black-and-white ggplot2 theme for every plot in this script.
theme_set(theme_bw())

# Theme tweaks added to the figures below: fixed aspect ratio, centred title
# and subtitle, and no legend unless a plot switches it back on.
graphSettings <- theme(
  aspect.ratio = 0.618,
  plot.title = element_text(hjust = 0.5),
  plot.subtitle = element_text(hjust = 0.5),
  legend.position = "none"
)

# Segment labels treated as vowels, and the subset treated as long vowels.
vowels <- c("a", "aa", "e", "i", "ii", "o", "oo", "u", "uu", "A", "AA", "E")
longVowels <- c("aa", "ii", "oo", "uu", "AA")

# Load word data: read the spreadsheet and keep only its `Word` and `Stress`
# columns, renamed to `word` and `stress.pos`.
words.df <- dplyr::select(
  read_xlsx("primaryStress.xlsx"),
  word = Word,
  stress.pos = Stress
)

# Load production data.
# Every file in the working directory named "resultsfile_<anything>.csv" is
# read with all columns as character, and the files are stacked into one table.
# The dot is escaped and the pattern is anchored at the end so that names such
# as "resultsfile_x.csv.bak" are not picked up by accident.
results.files <- dir(pattern = "^resultsfile_.*\\.csv$")
if (length(results.files) == 0) {
  stop("No resultsfile_*.csv files found in ", getwd(), call. = FALSE)
}

df <- map_dfr(results.files,
              function(x) {
                read.csv(file = x, header = TRUE, colClasses = "character")
              }) %>%
  # Subject code = the part of `Filename` before the first underscore.
  # NOTE(review): a previously commented-out line referred to a mangled
  # "Filename" column name (possibly a byte-order mark in the CSV header);
  # confirm the column is always read as `Filename`.
  mutate(subject = gsub("^([^_]+)_.*", "\\1", Filename)) %>%
  dplyr::select(subject, word, syl = syllable, seg = segment, dur = duration,
                syl.dur = syllableDuration, word.dur = wordDuration,
                f0 = f0_midpoint, word.f0 = meanWordPitch, F1 = F1_midpoint,
                F2 = F2_midpoint, intensity = intensity_midpoint,
                word.intensity = meanWordIntensity) %>%
  # Convert the measurement columns to numeric. Cells containing "undefined"
  # are blanked to NA *before* as.numeric() so they do not trigger
  # "NAs introduced by coercion" warnings; any other unparseable cell still
  # warns.
  mutate(across(c("dur", "syl.dur", "word.dur", "f0", "word.f0", "F1", "F2",
                  "intensity", "word.intensity"),
                ~ as.numeric(if_else(grepl("undefined", .x),
                                     NA_character_, .x)))) %>%
  # seg.pos: row order of each segment within a subject's word.
  group_by(subject, word) %>%
  mutate(seg.pos = row_number()) %>%
  ungroup() %>%
  # mean.seg.pos: average segment position of each syllable label; it is only
  # used to order the syllables of a word.
  group_by(subject, word, syl) %>%
  mutate(mean.seg.pos = mean(seg.pos)) %>%
  ungroup() %>%
  # syl.pos: rank of the syllable within the word; total.syls: number of
  # syllables in the word; final.syl: TRUE for the last syllable.
  group_by(subject, word) %>%
  mutate(syl.pos = dense_rank(mean.seg.pos),
         total.syls = max(syl.pos),
         final.syl = syl.pos == total.syls) %>%
  ungroup() %>%
  dplyr::select(-mean.seg.pos) %>%
  # Attach the stress position of each word; words missing from words.df get
  # NA, and so `stressed` is NA for them.
  left_join(words.df, by = "word") %>%
  mutate(vowel = seg %in% vowels,
         stressed = syl.pos == stress.pos,
         longVowel = seg %in% longVowels) %>%
  # open: TRUE when the last segment of the syllable is a vowel.
  group_by(subject, word, syl) %>%
  mutate(open = last(seg, order_by = seg.pos) %in% vowels) %>%
  ungroup()


# Gender label ("F" / "M") for each subject code, built column-wise.
genderData <- tibble(
  subject = c("AA", "AM", "AZ", "GR", "MA", "MR", "RA", "RR", "SA", "YA"),
  gender  = c("F",  "M",  "M",  "F",  "M",  "F",  "F",  "F",  "M",  "M")
)



# What are the correlates of stress?
#### Duration
# Full model: vowel duration predicted by stress and the control predictors,
# with random intercepts for segment and subject. The data are the vowel rows
# of words whose stress position is known.
dur.primary.m <- lmer(
  dur ~ stressed + longVowel + total.syls + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)
summary(dur.primary.m)
summary(dur.primary.m)$coefficients

# Reduced model: the same specification without the `stressed` predictor.
dur.primary.2.m <- lmer(
  dur ~ longVowel + total.syls + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)

# Compare the two models; lower AIC and BIC indicate the better model.
anova(dur.primary.m, dur.primary.2.m)

# Random-effect intercepts per segment and per subject
# (e.g. which vowels are longer than average?).
ranef(dur.primary.m)

# Residual diagnostics. The model above is fit to untransformed `dur`; a
# non-significant Shapiro-Wilk p-value is what we want as evidence that the
# residuals are normally distributed.
hist(resid(dur.primary.m))
shapiro.test(resid(dur.primary.m))

# Residuals against the number of syllables in the word, with a smoother.
data.frame(model.matrix(dur.primary.m)) %>%
  mutate(resid = resid(dur.primary.m)) %>%
  ggplot(aes(x = total.syls, y = resid)) +
  geom_point() +
  stat_smooth()




#### F1
# Full model: F1 predicted by stress and the control predictors, with random
# intercepts for segment and subject (vowels of words with known stress only).
F1.primary.m <- lmer(
  F1 ~ stressed + longVowel + total.syls + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)
summary(F1.primary.m)
summary(F1.primary.m)$coefficients

# Reduced model: the same specification without the `stressed` predictor.
F1.primary.2.m <- lmer(
  F1 ~ longVowel + total.syls + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)

# Compare the two models; lower AIC and BIC indicate the better model.
anova(F1.primary.m, F1.primary.2.m)

# Random-effect intercepts per segment and per subject
# (e.g. which vowels have a higher F1 than average?).
ranef(F1.primary.m)

# Residual diagnostics for the F1 model; a non-significant Shapiro-Wilk
# p-value is what we want as evidence of normally distributed residuals.
hist(resid(F1.primary.m))
shapiro.test(resid(F1.primary.m))

# Residuals against the number of syllables in the word, with a smoother.
data.frame(model.matrix(F1.primary.m)) %>%
  mutate(resid = resid(F1.primary.m)) %>%
  ggplot(aes(x = total.syls, y = resid)) +
  geom_point() +
  stat_smooth()


### F2
# Full model: F2 predicted by stress and the control predictors, with random
# intercepts for segment and subject (vowels of words with known stress only).
F2.primary.m <- lmer(
  F2 ~ stressed + longVowel + total.syls + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)
summary(F2.primary.m)
summary(F2.primary.m)$coefficients

# Reduced model: the same specification without the `stressed` predictor.
F2.primary.2.m <- lmer(
  F2 ~ longVowel + total.syls + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)

# Compare the two models; lower AIC and BIC indicate the better model.
anova(F2.primary.m, F2.primary.2.m)

# Random-effect intercepts per segment and per subject
# (e.g. which vowels have a higher F2 than average?).
ranef(F2.primary.m)

# Residual diagnostics for the F2 model; a non-significant Shapiro-Wilk
# p-value is what we want as evidence of normally distributed residuals.
hist(resid(F2.primary.m))
shapiro.test(resid(F2.primary.m))

# Residuals against the number of syllables in the word, with a smoother.
data.frame(model.matrix(F2.primary.m)) %>%
  mutate(resid = resid(F2.primary.m)) %>%
  ggplot(aes(x = total.syls, y = resid)) +
  geom_point() +
  stat_smooth()





### Intensity
# Full model: midpoint intensity predicted by stress, vowel length, the word's
# mean intensity and syllable properties, with random intercepts for segment
# and subject (vowels of words with known stress only).
intensity.primary.m <- lmer(
  intensity ~ stressed + longVowel + word.intensity + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)

# Reduced model: the same specification without the `stressed` predictor.
intensity.primary.2.m <- lmer(
  intensity ~ longVowel + word.intensity + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)

# Model comparison, then the full model's summary and coefficient table.
anova(intensity.primary.m, intensity.primary.2.m)
summary(intensity.primary.m)
summary(intensity.primary.m)$coefficients

# Random-effect intercepts per segment and per subject
# (e.g. which vowels are more intense than average?).
ranef(intensity.primary.m)

# Residual diagnostics: histogram and Shapiro-Wilk test.
hist(resid(intensity.primary.m))
shapiro.test(resid(intensity.primary.m))

# Residuals against the word's mean intensity, with a smoother.
data.frame(model.matrix(intensity.primary.m)) %>%
  mutate(resid = resid(intensity.primary.m)) %>%
  ggplot(aes(x = word.intensity, y = resid)) +
  geom_point() +
  stat_smooth()



# Intensity: stressed & unstressed Vs, long vs. short Vs
# Model with and without `longVowel`, fit to all vowels of words with known
# stress.
intensity.primary.length.all.m <- lmer(
  intensity ~ longVowel + word.intensity + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)
intensity.primary.length.all.2.m <- lmer(
  intensity ~ word.intensity + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel)
)
summary(intensity.primary.length.all.m)
anova(intensity.primary.length.all.m, intensity.primary.length.all.2.m)


# Intensity: stressed Vs only, long vs. short Vs
# Same pair of models, restricted to stressed vowels.
intensity.primary.length.m <- lmer(
  intensity ~ longVowel + word.intensity + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel,
           stressed)
)
intensity.primary.length.2.m <- lmer(
  intensity ~ word.intensity + final.syl + open +
    (1 | seg) + (1 | subject),
  data = df %>%
    filter(!is.na(stress.pos),
           vowel,
           stressed)
)
summary(intensity.primary.length.m)
anova(intensity.primary.length.m, intensity.primary.length.2.m)






# F0
# Analysis table for F0: vowels of words with known stress, with no missing
# values in f0, word.f0 or open, plus each subject's gender.
f0.df <- df %>%
  filter(!is.na(stress.pos),
         vowel,
         !is.na(word.f0),
         !is.na(open),
         !is.na(f0)) %>%
  left_join(genderData, by = "subject")

# Full and reduced (no `stressed`) models on the complete F0 table.
f0.primary.m.initial <- lmer(f0 ~ stressed + longVowel + word.f0 + open + final.syl +
                               (1 | seg) + (1 | subject),
                             data = f0.df)
f0.primary.2.m.initial <- lmer(f0 ~ longVowel + word.f0 + open + final.syl +
                                 (1 | seg) + (1 | subject),
                               data = f0.df)
anova(f0.primary.m.initial, f0.primary.2.m.initial)
summary(f0.primary.m.initial)
summary(f0.primary.m.initial)$coefficients


# Random-effect intercepts per segment and per subject
# (e.g. which vowels have a higher F0 than average?).
ranef(f0.primary.m.initial)

# Histogram of the residuals; the subsetting in [] leaves out residuals whose
# absolute value is 50 or more. The Shapiro-Wilk test uses all residuals.
hist(resid(f0.primary.m.initial)[abs(resid(f0.primary.m.initial)) < 50])
shapiro.test(resid(f0.primary.m.initial))

# Residuals against the word's mean F0, with a smoother.
model.matrix(f0.primary.m.initial) %>%
  data.frame() %>%
  mutate(resid = resid(f0.primary.m.initial)) %>%
  ggplot(aes(x = word.f0, y = resid)) +
  geom_point() +
  stat_smooth()

# The preceding models have big outliers.  Find those outliers, remove them,
# run the models again.
which(abs(resid(f0.primary.m.initial)) > 50)

# The subsetting below indexes the rows of f0.df with the residual vector, so
# the two must line up one-to-one. If lmer had dropped any row (e.g. because of
# an NA in a model variable), the logical index would silently pick the wrong
# rows; fail loudly instead.
stopifnot(length(resid(f0.primary.m.initial)) == nrow(f0.df))
f0.df.truncated <- f0.df[abs(resid(f0.primary.m.initial)) <= 50, ]


# Same pair of models, refit without the outlying observations.
f0.primary.m.truncated <- lmer(f0 ~ stressed + longVowel + word.f0 + open + final.syl +
                                 (1 | seg) + (1 | subject),
                               data = f0.df.truncated)
f0.primary.2.m.truncated <- lmer(f0 ~ longVowel + word.f0 + open + final.syl +
                                   (1 | seg) + (1 | subject),
                                 data = f0.df.truncated)
anova(f0.primary.m.truncated, f0.primary.2.m.truncated)
summary(f0.primary.m.truncated)
summary(f0.primary.m.truncated)$coefficients


# Random-effect intercepts per segment and per subject for the refit model.
ranef(f0.primary.m.truncated)

# Residual diagnostics for the refit model.
hist(resid(f0.primary.m.truncated))
shapiro.test(resid(f0.primary.m.truncated))

model.matrix(f0.primary.m.truncated) %>%
  data.frame() %>%
  mutate(resid = resid(f0.primary.m.truncated)) %>%
  ggplot(aes(x = word.f0, y = resid)) +
  geom_point() +
  stat_smooth()










# F0 by syllable position for each vowel, one panel per word, restricted to
# words that sort before "buXalaa".
df %>%
  filter(vowel, word < "buXalaa") %>%
  ggplot(aes(syl.pos, f0, color = stressed)) +
  geom_point() +
  facet_wrap(~ word, ncol = 5)

# One row per distinct word in the production data, printed in full.
wordList <- distinct(df, word)

print(wordList, n = Inf)


## Plots

# Vowel rows for which `stressed` is NA.
df %>%
  filter(vowel, is.na(stressed))


# scatterplot: syllable duration by syllable position for short vowels,
# one panel per subject
df %>%
  filter(vowel, !longVowel) %>%
  ggplot(aes(x = syl.pos, y = syl.dur, color = stressed)) +
  geom_point() +
  facet_wrap(~ subject, ncol = 6)

# box plot: duration
# The y axis is labelled in ms, so `dur` is scaled by 1000 here, exactly as the
# long-vs-short duration plot later in this script already does.
# NOTE(review): this assumes `dur` is recorded in seconds -- confirm.
# The plot is built once, shown, saved, and then reused for the outlier
# listing. (Append `+ facet_wrap(~seg)` to split it by segment.)
p <- df %>%
  filter(vowel == TRUE) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = dur * 1000)) +
  geom_boxplot() +
  ggtitle("Stressed and Unstressed Vowels: Duration") +
  xlab("Syllable Type") +
  ylab("Vowel Duration (ms)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")
p

ggsave("duration_highres.png", plot = p, width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers

##

# Same box plot with outline colour by vowel length (same ms scaling).
df %>%
  filter(vowel == TRUE) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = dur * 1000,
             color = longVowel)) +
  scale_color_grey() +
  geom_boxplot() +
  ggtitle("Stressed and Unstressed Vowels: Duration") +
  xlab("Syllable Type") +
  ylab("Vowel Duration (ms)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")

# box plot: `dur * 1000` (axis labelled ms) for stressed vs unstressed vowels,
# filled by vowel length. Built once, shown, saved as the most recent plot,
# then reused for the outlier listing.
# (Append `+ facet_wrap(~seg)` to split it by segment.)
p <- df %>%
  filter(vowel) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = dur * 1000,
             fill = longVowel)) +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: Long vs Short Duration",
       x = "Syllable Type",
       y = "Vowel Duration (ms)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom") +
  scale_fill_grey(start = 0.8, end = 0.4,
                  labels = c("Short", "Long"),
                  name = "Vowel Length")
p

ggsave("durationLongvsShort_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##

# number of vowels in the above plot, by stress and by vowel length
df %>%
  filter(vowel) %>%
  count(stressed)

df %>%
  filter(vowel) %>%
  count(longVowel)


# vowels per subject, then the mean / max / min of those per-subject counts
df %>%
  filter(vowel) %>%
  count(subject)

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(mean = mean(n))

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(max = max(n))

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(min = min(n))

# mean vowel duration by stress and vowel length, then by stress alone.
# na.rm = TRUE: the loader turns "undefined" measurements into NA, and a single
# NA would otherwise make the whole group mean NA.
df %>%
  filter(vowel) %>%
  group_by(stressed, longVowel) %>%
  summarize(mean = mean(dur, na.rm = TRUE))


df %>%
  filter(vowel) %>%
  group_by(stressed) %>%
  summarize(mean = mean(dur, na.rm = TRUE))


# number of tokens of each vowel segment, by stress
df %>%
  filter(vowel) %>%
  group_by(seg, stressed) %>%
  summarize(n = n())

# box plot: F1 for stressed vs unstressed vowels.
# Built once, shown, saved as the most recent plot, then reused for the
# outlier listing. (Append `+ facet_wrap(~seg)` to split it by segment.)
p <- df %>%
  filter(vowel) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = F1)) +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: F1",
       x = "Syllable Type",
       y = "F1 (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")
p

ggsave("F1_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##

# box plot: F1 for stressed vs unstressed vowels, filled by vowel length.
p <- df %>%
  filter(vowel) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = F1, fill = longVowel)) +
  scale_fill_grey(start = 0.8, end = 0.4,
                  labels = c("Short", "Long"),
                  name = "Vowel Length") +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: Long vs Short F1",
       x = "Syllable Type",
       y = "F1 (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")
p

ggsave("F1LongvsShort_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##

# number of vowel rows by stress and by vowel length
# (these counts include any rows whose F1 is NA)
df %>%
  filter(vowel) %>%
  count(stressed)

df %>%
  filter(vowel) %>%
  count(longVowel)


# vowels per subject, then the mean / max / min of those per-subject counts
df %>%
  filter(vowel) %>%
  count(subject)

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(mean = mean(n))

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(max = max(n))

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(min = min(n))

# mean F1 by stress and vowel length, then by stress alone.
# na.rm = TRUE: the loader turns "undefined" formant values into NA, and a
# single NA would otherwise make the whole group mean NA.
df %>%
  filter(vowel) %>%
  group_by(stressed, longVowel) %>%
  summarize(mean = mean(F1, na.rm = TRUE))


df %>%
  filter(vowel) %>%
  group_by(stressed) %>%
  summarize(mean = mean(F1, na.rm = TRUE))


# number of tokens of each vowel segment, by stress
df %>%
  filter(vowel) %>%
  group_by(seg, stressed) %>%
  summarize(n = n())

# box plot: F2 for stressed vs unstressed vowels.
# Built once, shown, saved, then reused for the outlier listing.
# (Append `+ facet_wrap(~seg)` to split it by segment.)
p <- df %>%
  filter(vowel == TRUE) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = F2)) +
  geom_boxplot() +
  ggtitle("Stressed and Unstressed Vowels: F2") +
  xlab("Syllable Type") +
  ylab("F2 (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")
p

ggsave("F2_highres.png", plot = p, width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##

# box plot: F2 for stressed vs unstressed vowels, filled by vowel length.
# Fixes: the y axis was labelled "F1 (Hz)" although F2 is plotted, and the
# legend now carries the same "Vowel Length" / Short / Long labels as the
# corresponding F1 plot.
df %>%
  filter(vowel == TRUE) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = F2, fill = longVowel)) +
  scale_fill_grey(start = 0.8, end = 0.4,
                  labels = c("Short", "Long"),
                  name = "Vowel Length") +
  geom_boxplot() +
  ggtitle("Stressed and Unstressed Vowels: Long vs Short F2") +
  xlab("Syllable Type") +
  ylab("F2 (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")

ggsave("F2LongvsShort.png")




# number of vowel rows by stress and by vowel length
# (these counts include any rows whose F2 is NA)
df %>%
  filter(vowel) %>%
  count(stressed)

df %>%
  filter(vowel) %>%
  count(longVowel)


# vowels per subject, then the mean / max / min of those per-subject counts
df %>%
  filter(vowel) %>%
  count(subject)

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(mean = mean(n))

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(max = max(n))

df %>%
  filter(vowel) %>%
  count(subject) %>%
  summarize(min = min(n))

# mean F2 by stress and vowel length, then by stress alone.
# na.rm = TRUE: the loader turns "undefined" formant values into NA, and a
# single NA would otherwise make the whole group mean NA.
df %>%
  filter(vowel) %>%
  group_by(stressed, longVowel) %>%
  summarize(mean = mean(F2, na.rm = TRUE))


df %>%
  filter(vowel) %>%
  group_by(stressed) %>%
  summarize(mean = mean(F2, na.rm = TRUE))


# number of tokens of each vowel segment, by stress
df %>%
  filter(vowel) %>%
  group_by(seg, stressed) %>%
  summarize(n = n())


# box plot: midpoint intensity divided by the word's mean intensity, for
# stressed vs unstressed vowels with a non-missing intensity.
# Built once, shown, saved as the most recent plot, then reused for the
# outlier listing. (Optional extras that were tried: restricting to short
# vowels, and `facet_grid(seg ~ subject)`.)
p <- df %>%
  filter(vowel, !is.na(intensity)) %>%
  mutate(intensityProp = intensity / word.intensity) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = intensityProp)) +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: Midpoint Intensity",
       subtitle = "As a Proportion of Word Intensity",
       x = "Syllable Type",
       y = "Proportional Intensity (dB)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings
p

ggsave("intensity_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##


# Same plot, filled by vowel length and with the legend shown.
# (Append `+ facet_wrap(~seg)` to split it by segment.)
p <- df %>%
  filter(vowel, !is.na(intensity)) %>%
  mutate(intensityProp = intensity / word.intensity) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = intensityProp,
             fill = longVowel)) +
  scale_fill_grey(start = 0.8, end = 0.4,
                  labels = c("Short", "Long"),
                  name = "Vowel Length") +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: Long vs Short Midpoint Intensity",
       subtitle = "As a Proportion of Word Intensity",
       x = "Syllable Type",
       y = "Proportional Intensity (dB)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")
p

ggsave("intensityLongvsShort_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##


# number of vowels in the above plot (vowels with a non-missing intensity),
# by stress
df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(stressed)

# number of vowels whose intensity is missing
df %>%
  filter(vowel, is.na(intensity)) %>%
  summarize(n = n())


# ... by vowel length
df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(longVowel)

# ... per subject, followed by the mean / max / min of the per-subject counts
df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(subject)

df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(subject) %>%
  summarize(mean = mean(n))

df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(subject) %>%
  summarize(max = max(n))

df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(subject) %>%
  summarize(min = min(n))

# per-subject counts sorted from fewest to most
df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(subject) %>%
  arrange(n)

# mean of the per-subject counts (same quantity as computed above)
df %>%
  filter(vowel, !is.na(intensity)) %>%
  count(subject) %>%
  summarize(mean = mean(n))

# long vowels with a non-missing intensity: total, mean per subject, and
# per-subject counts sorted from fewest to most
df %>%
  filter(vowel, longVowel, !is.na(intensity)) %>%
  summarize(n = n())

df %>%
  filter(vowel, longVowel, !is.na(intensity)) %>%
  count(subject) %>%
  summarize(mean = mean(n))

df %>%
  filter(vowel, longVowel, !is.na(intensity)) %>%
  count(subject) %>%
  arrange(n)



# mean intensity by stress and vowel length, then by stress alone
df %>%
  filter(vowel, !is.na(intensity)) %>%
  group_by(stressed, longVowel) %>%
  summarize(mean = mean(intensity))


df %>%
  filter(vowel, !is.na(intensity)) %>%
  group_by(stressed) %>%
  summarize(mean = mean(intensity))

# box plot: F0, with word F0 subtracted out (vowels with a non-missing f0).
# Each saved plot below is built once, shown, saved as the most recent plot,
# then reused for the outlier listing.
p <- df %>%
  filter(vowel, !is.na(f0)) %>%
  mutate(f0diff = f0 - word.f0) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = f0diff)) +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: F0",
       subtitle = "Midpoint F0 Minus Mean Word F0",
       x = "Syllable Type",
       y = "F0 Difference (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings
p

ggsave("f0_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##

# Same F0 difference, filled by vowel length.
# (Append `+ facet_wrap(~seg)` to split it by segment.)
p <- df %>%
  filter(vowel, !is.na(f0)) %>%
  mutate(f0diff = f0 - word.f0) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = f0diff,
             fill = longVowel)) +
  scale_fill_grey(start = 0.8, end = 0.4,
                  labels = c("Short", "Long"),
                  name = "Vowel Length") +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: Long vs Short F0",
       subtitle = "Midpoint F0 Minus Mean Word F0",
       x = "Syllable Type",
       y = "F0 Difference (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")
p

ggsave("f0LongvsShort_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##


# F0 difference filled by gender (uses f0.df, which carries `gender`);
# displayed only, not saved.
f0.df %>%
  filter(vowel, !is.na(f0)) %>%
  mutate(f0diff = f0 - word.f0) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = f0diff, fill = gender)) +
  scale_fill_grey(start = 0.8, end = 0.4) +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: F vs M F0",
       subtitle = "Midpoint F0 Minus Mean Word F0",
       x = "Syllable Type",
       y = "F0 Difference (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")


# Raw midpoint F0 filled by gender.
p <- f0.df %>%
  filter(vowel, !is.na(f0)) %>%
  mutate(f0diff = f0 - word.f0) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = f0, fill = gender)) +
  scale_fill_grey(start = 0.8, end = 0.4) +
  geom_boxplot() +
  labs(title = "Stressed and Unstressed Vowels: Female vs Male F0",
       subtitle = "Midpoint F0",
       x = "Syllable Type",
       y = "F0 (Hz)") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings +
  theme(legend.position = "bottom")
p

ggsave("f0genderdifference_highres.png", width = 6, units = "in")

## Outliers for the above graph
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##

# mean number of long vowels with a non-missing f0 per subject
df %>%
  filter(vowel, !is.na(f0), longVowel) %>%
  count(subject) %>%
  summarize(mean = mean(n))

# number of vowels whose f0 is missing
df %>%
  filter(vowel, is.na(f0)) %>%
  summarize(n = n())

# vowels with a non-missing f0 per subject, then the mean / max / min of the
# per-subject counts
df %>%
  filter(vowel, !is.na(f0)) %>%
  count(subject)

df %>%
  filter(vowel, !is.na(f0)) %>%
  count(subject) %>%
  summarize(mean = mean(n))

df %>%
  filter(vowel, !is.na(f0)) %>%
  count(subject) %>%
  summarize(max = max(n))

df %>%
  filter(vowel, !is.na(f0)) %>%
  count(subject) %>%
  summarize(min = min(n))

# ... by stress and by vowel length
df %>%
  filter(vowel, !is.na(f0)) %>%
  count(stressed)

df %>%
  filter(vowel, !is.na(f0)) %>%
  count(longVowel)

# mean f0 by stress and vowel length
df %>%
  filter(vowel, !is.na(f0)) %>%
  group_by(stressed, longVowel) %>%
  summarize(mean = mean(f0))

# mean f0 per subject within gender, sorted from lowest to highest:
# first over all rows of f0.df, then over stressed vowels only
f0.df %>%
  group_by(gender, subject) %>%
  summarize(meanF0 = mean(f0)) %>%
  arrange(meanF0)

f0.df %>%
  filter(stressed) %>%
  group_by(gender, subject) %>%
  summarize(meanF0 = mean(f0)) %>%
  arrange(meanF0)


# mean f0 by stress
df %>%
  filter(vowel, !is.na(f0)) %>%
  group_by(stressed) %>%
  summarize(mean = mean(f0))
  

# box plot: F0 as a proportion of word F0
# `f0prop` is the vowel's midpoint F0 divided by the word's mean F0, mirroring
# how the intensity proportion is computed elsewhere in this script. It was
# previously computed as f0 - word.f0, which is the difference already shown in
# the "F0 Difference" plots and does not match this plot's axis label.
df %>%
  mutate(f0prop = f0 / word.f0) %>%
  filter(vowel == TRUE,
         !is.na(f0)) %>%
  ggplot(aes(x = fct_rev(as.factor(stressed)), y = f0prop)) +
  geom_boxplot() +
  ggtitle("Stressed and Unstressed Syllables: F0") +
  xlab("Syllable Type") +
  ylab("F0 Proportion") +
  scale_x_discrete(labels = c("Stressed", "Unstressed")) +
  graphSettings
  
# line plot: F0 around the stressed syllable.
# For each subject's word, keep the short vowels in the stressed syllable and
# its immediate neighbours, express each f0 relative to the first non-missing
# f0 among those vowels (in syllable order), and draw one line per word in a
# panel per subject.
df %>%
  filter(vowel) %>%
  arrange(syl.pos) %>%
  group_by(subject, word) %>%
  mutate(rel.pos = syl.pos - stress.pos) %>%
  filter(!longVowel, abs(rel.pos) <= 1) %>%
  mutate(prop.f0 = f0 / first(na.omit(f0))) %>%
  ungroup() %>%
  ggplot(aes(x = as.factor(rel.pos), y = prop.f0, group = word)) +
  geom_line(alpha = 0.2) +
  facet_wrap(~ subject)



# Exploratory work

# Stressed short vowels.
# NOTE(review): this query was headed "open syllables with short Vs", but it
# does not filter on `open`; add `open` to the filter if only open syllables
# are wanted.
df %>%
  filter(stressed, vowel, !longVowel)

# All stressed vowels.
# NOTE(review): likewise headed "open syllables", but `open` is not used here.
df %>%
  filter(stressed, vowel)


# How many long Vs are unstressed? (long vowels listed with stressed ones first)
df %>%
  filter(longVowel) %>%
  select(word, seg, syl, stressed) %>%
  arrange(desc(stressed))

# Get words with multiple long Vs (per subject)
df %>%
  filter(longVowel) %>%
  group_by(subject, word) %>%
  summarize(longVs = n()) %>%
  filter(longVs > 1)

# Print every distinct word and write the list to uniqueWords.txt.
df %>%
  distinct(word) %>%
  print(n = Inf) %>%
  write_delim("uniqueWords.txt")

# Which Vs are NA for stress? Distinct words with such vowels, joined back to
# the word list.
df %>%
  filter(vowel, is.na(stressed)) %>%
  distinct(word) %>%
  left_join(words.df, by = "word") %>%
  print(n = Inf)
