library(tidyverse)
library(MASS)




# ggplot setup
#
# Make theme_bw() the session default, then layer the project-wide tweaks on
# top of it: centred title and subtitle, and a fixed panel aspect ratio.
theme_set(theme_bw())
theme_update(
  plot.title = element_text(hjust = 0.5),
  plot.subtitle = element_text(hjust = 0.5),
  # fig.width = 6,
  aspect.ratio = 0.618
)


# Set up the data

# Short names used throughout the script for the demographic columns of the
# raw response files (new name = header as it appears in the file). The
# headers are reproduced verbatim because they must match the files exactly.
demographicNames <- c(
  "Buraydah" = "Born and raised in Buraydah",
  "education" = "Level of Education",
  "homeLang" = "variety used at home",
  "hearingProbs" = "hearing/ speaking problems",
  "outside" = "lived outside Burayday more than 6 months",
  "otherLangs" = "speak other languages",
  "CAstart" = "Exposure to CA start date",
  "SAstart" = "Exposure to SA start date",
  "friends" = "variety used with friends"
)

# Reshape one year's wide response table to long format.
#   responses    wide table with one column per rated word
#   wordCols     tidyselect expression picking out the word columns
#   variantInfo  per-word lookup table, joined on its `Variant` column
# Returns one row per (respondent row, word) with the rating in `rating`,
# the lookup columns attached, and the demographic columns renamed.
reshapeResponses <- function(responses, wordCols, variantInfo) {
  responses %>%
    pivot_longer({{ wordCols }}, names_to = "word", values_to = "rating") %>%
    left_join(variantInfo, by = c("word" = "Variant")) %>%
    rename(all_of(demographicNames))
}

wordInfo <- read_csv("Frequency Project Opt processes.csv") %>%
  rename("process" = "Opt. Process",
         "varID" = "Variant #")

wordInfo2023 <- readxl::read_excel("Frequency data process variant categorization2023.xlsx") %>%
  rename("process" = "Process",
         "varID" = "Variant #")

#colnames(df)

df <- read_csv("FrequencyData2022.csv") %>%
  reshapeResponses(alDawah:tistanzifk, wordInfo)

df2023 <- read_csv("FrequencyData2023.csv") %>%
  reshapeResponses(akalEthum:kartinwah, wordInfo2023)

# Stack the two years. bind_rows() appends rows and matches columns by name;
# the previous full_join() with no `by` joined on every shared column, which
# prints a join message and would merge rows that happen to be identical in
# both years instead of keeping both.
df <- bind_rows(df, df2023)

################## Exploratory analysis

# Mean rating of each variant, coloured by the process it belongs to.
df %>%
  group_by(process, varID) %>%
  summarize(meanRating = mean(rating)) %>%
  ggplot(mapping = aes(varID, meanRating, colour = process)) +
  geom_point()

# Rating distribution of each variant, one free-scaled panel per process.
ggplot(data = df, mapping = aes(varID, rating, colour = process)) +
  geom_boxplot() +
  facet_wrap(~process, scales = "free")

# Let's look at one process in particular: mean rating of each of its variants.
df %>%
  filter(process == "CplxFCVCCFCVC") %>%
  group_by(varID) %>%
  summarize(meanRating = mean(rating))

# How often each rating value was given to the "CplxFCVCC" variant ...
df %>%
  filter(varID == "CplxFCVCC") %>%
  count(rating, name = "count")

# ... and the same tally broken down by gender.
df %>%
  filter(varID == "CplxFCVCC") %>%
  group_by(Gender, rating) %>%
  summarize(count = n())

# Ratings by gender for the same process, one free-scaled panel per variant.
df %>%
  filter(process == "CplxFCVCCFCVC") %>%
  ggplot(mapping = aes(Gender, rating, colour = process)) +
  geom_boxplot() +
  facet_wrap(~varID, scales = "free")


# A different variant: tally of rating values for "FCVCC".
df %>%
  filter(varID == "FCVCC") %>%
  count(rating, name = "count")



#################

# Ordered logistic regression of rating on variant for the "VM" process.
# polr() needs a factor response, so the ratings are converted first.
vm.df <- mutate(filter(df, process == "VM"), rating = as.factor(rating))
ordered.logistic.fit.vm <- polr(rating ~ varID, data = vm.df)

coef(summary(ordered.logistic.fit.vm))

# Cut-points drawn on the illustration below.
# NOTE(review): these are hard-coded; presumably they were copied from the
# intercepts printed above -- confirm they still match the current data.
breaks <- c(
  -1.81411295, -1.06395889, -0.75738872,
  -0.37593062, -0.07016727, 0.23374628
)

# Illustration: a logistic density centred at 1, with the area between -0.5
# and 2 shaded and labelled with its probability mass, and the cut-points
# marked as vertical lines and x-axis ticks.
data.frame(z = seq(-5, 5, by = 0.01)) %>%
  mutate(d = dlogis(z, location = 1)) %>%
  ggplot(mapping = aes(x = z, y = d)) +
  geom_area(
    stat = "function",
    fun = function(x) dlogis(x, location = 1),
    fill = "gray",
    xlim = c(-0.5, 2)
  ) +
  geom_line() +
  geom_vline(xintercept = breaks) +
  annotate(
    "label",
    x = 0.7,
    y = 0.1,
    label = as.expression(
      bquote(P(y[i] == B) == .(round(plogis(2) - plogis(-0.5), 2)))
    )
  ) +
  scale_x_continuous(
    expression(z[i] == alpha + beta*x[i]),
    breaks = breaks,
    labels = c("1|2", "2|3", "3|4", "4|5", "5|6", "6|7")
  ) +
  scale_y_continuous("") +
  theme(panel.grid.minor.x = element_blank())




#################
# "MidVEpen" process: ratings per variant by age group.

# Respondents aged 40 and over, split at 60 (TRUE = older than 60).
df %>%
  filter(process == "MidVEpen", Age >= 40) %>%
  mutate(Age = Age > 60) %>%
  ggplot(aes(x = varID, y = rating, color = Age)) +
  geom_boxplot()

# Respondents under 60, split at 40 (TRUE = older than 40).
df %>%
  filter(process == "MidVEpen", Age < 60) %>%
  mutate(Age = Age > 40) %>%
  ggplot(aes(x = varID, y = rating, color = Age)) +
  geom_boxplot()

# Ordered logistic regression of rating on variant, age group and their
# interaction. polr() needs a factor response.
fem.df <- df %>%
  filter(process == "MidVEpen") %>%
  mutate(rating = as.factor(rating))

# The formula refers to `Young`, which was never created for fem.df, so the
# fit failed with "object 'Young' not found". Derive it inside `data =` the
# same way the other age models in this script do (Age < 65).
# NOTE(review): the plots above explore cut-offs of 40 and 60 -- confirm that
# 65 is the split intended for this model.
ordered.logistic.fit.fem <- polr(rating ~ varID + Young + varID:Young,
                                 data = fem.df %>%
                                   mutate(Young = Age < 65))

coef(summary(ordered.logistic.fit.fem))



################### Metathesis

# Ratings for the "VM" process, with the response as a factor for polr().
meta.df <- mutate(filter(df, process == "VM"), rating = as.factor(rating))

# Ordered logistic regression on variant, age group (Young = under 65) and
# their interaction; the age flag is added on the fly for the fit only.
ordered.logistic.fit.meta <- polr(
  rating ~ varID + Young + varID:Young,
  data = mutate(meta.df, Young = Age < 65)
)
coef(summary(ordered.logistic.fit.meta))




################### Gemination

# Ratings for the "Gem" process, with the response as a factor for polr().
# This used to be stored in `meta.df`, overwriting the metathesis data. The
# models are fitted without Hess = TRUE, so summary()/vcov() re-evaluate the
# stored call to obtain the Hessian; with `meta.df` overwritten, summarising
# the metathesis fit again would silently refit it on gemination data.
# Keeping the gemination data in its own object avoids that.
gem.df <- df %>%
  filter(process == "Gem") %>%
  mutate(rating = as.factor(rating))

# Ordered logistic regression on variant, age group (Young = under 65) and
# their interaction.
ordered.logistic.fit.gem <- polr(rating ~ varID + Young + varID:Young,
                                 data = gem.df %>%
                                   mutate(Young = Age < 65))
coef(summary(ordered.logistic.fit.gem))




#########
######### Things for dissertation
#########



# "OldYoung" process: rating against age, one colour and smoother per variant.
df %>%
  filter(process == "OldYoung") %>%
  ggplot(mapping = aes(Age, rating, colour = varID)) +
  geom_point() +
  geom_smooth()


# Same plot, one panel per word.
df %>%
  filter(process == "OldYoung") %>%
  ggplot(mapping = aes(Age, rating, colour = varID)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~WordID)


# Boxplot of ratings per variant, split into respondents aged 65 or younger
# and respondents older than 65. The plot object is built once and reused
# for display, for the saved figure and for the outlier listing.
p <- df %>%
  filter(process == "OldYoung") %>%
  mutate(Age = Age > 65) %>%
  ggplot(mapping = aes(x = varID, y = rating, fill = Age)) +
  geom_boxplot() +
  labs(
    x = "Variant",
    y = "Rating",
    title = "Monomorphemic Triconsonantal Clusters"
  ) +
  scale_fill_manual(
    "Age",
    labels = c("65 or younger", ">65"),
    values = c("white", "gray50")
  ) +
  scale_x_discrete(labels = c("Old" = "CVCC", "Young" = "CCVC"))

p

ggsave("CCC_boxplot_by_age_highres.png", plot = p, width = 6, units = "in")


## Outliers for the above graph
# ggplot_build() exposes the computed boxplot statistics; the first layer's
# `outliers` column lists the points drawn beyond the whiskers of each box.
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##






#### Stats
# https://kaplanas.github.io/nonstandard-regression/nonstandard_regression.html

# "OldYoung" ratings prepared for polr(): factor response, and the variant
# codes replaced by the cluster shapes used as labels in the figure
# ("Young" -> "CCVC", "Old" -> "CVCC"); any other value is left unchanged.
old.df <- df %>%
  filter(process == "OldYoung") %>%
  mutate(
    rating = as.factor(rating),
    varID = case_when(
      varID == "Young" ~ "CCVC",
      varID == "Old" ~ "CVCC",
      TRUE ~ varID
    )
  )

# Ordered logistic regression on variant, age group and their interaction.
# The age flag is `Age <= 65` so that it is the exact complement of the
# `Age > 65` grouping used by the figure ("65 or younger" vs ">65") and by
# the descriptive checks below. It used to be `Age < 65`, which put
# 65-year-olds in the older group here but in the younger group everywhere
# else in this section.
ordered.logistic.fit <- polr(rating ~ varID + Young + varID:Young,
                             data = old.df %>%
                               mutate(Young = Age <= 65))

coef(summary(ordered.logistic.fit))


# Is the above consistent with this:

# Mean rating per variant.
df %>%
  filter(process == "OldYoung") %>%
  group_by(varID) %>%
  summarize(mean = mean(rating))

# Mean rating per age group (AgeGroup is TRUE for respondents older than 65).
df %>%
  filter(process == "OldYoung") %>%
  mutate(AgeGroup = Age > 65) %>%
  group_by(AgeGroup) %>%
  summarize(mean = mean(rating))

# Mean rating per variant and age group.
df %>%
  filter(process == "OldYoung") %>%
  mutate(AgeGroup = Age > 65) %>%
  group_by(varID, AgeGroup) %>%
  summarize(mean = mean(rating))

# Median rating per variant and age group.
df %>%
  filter(process == "OldYoung") %>%
  mutate(AgeGroup = Age > 65) %>%
  group_by(varID, AgeGroup) %>%
  summarize(median = median(rating))




## CCC: one morphological boundary

# Mean and median rating per variant of the "OneMB" process.
df %>%
  filter(process == "OneMB") %>%
  group_by(varID) %>%
  summarize(meanRating = mean(rating), medianRating = median(rating))


# Boxplot of ratings per variant. The plot object is built once and reused
# for display, for the saved figure and for the outlier listing.
p <- df %>%
  filter(process == "OneMB") %>%
  ggplot(mapping = aes(x = varID, y = rating)) +
  geom_boxplot() +
  labs(
    x = "Variant",
    y = "Rating",
    title = "Bimorphemic Triconsonantal Clusters"
  ) +
  scale_x_discrete(labels = c("Bad" = "CVC-C", "Good" = "CC-VC"))

p

ggsave("CCC_boxplot_OneMorphBoundary_highres.png", plot = p, width = 6, units = "in")

## Outliers for the above graph
# The first layer of the built plot carries the outlier points of each box.
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##

## CCC: two morphological boundaries

# Mean and median rating per variant of the "TwoMB" process.
df %>%
  filter(process == "TwoMB") %>%
  group_by(varID) %>%
  summarize(meanRating = mean(rating), medianRating = median(rating))


# Boxplot of ratings per variant. The plot and the outlier listing used to
# filter on "OneMB" (copied from the previous section), so the figure titled
# "Trimorphemic ..." and saved as ..._TwoMorphBoundaries_... actually showed
# the one-boundary data. Both now use "TwoMB", matching the summary above.
# The plot object is built once so the displayed figure, the saved file and
# the outlier listing cannot drift apart again.
# NOTE(review): the axis labels assume the TwoMB variants are coded "Bad" and
# "Good" like the OneMB ones -- confirm against the variant table.
p <- df %>%
  filter(process == "TwoMB") %>%
  ggplot(aes(x = varID, y = rating)) +
  geom_boxplot() +
  labs(x = "Variant",
       y = "Rating",
       title = "Trimorphemic Triconsonantal Clusters"
  ) +
  scale_x_discrete(labels = c("Bad" = "CV-C-C", "Good" = "C-C-VC"))

p

ggsave("CCC_boxplot_TwoMorphBoundaries_highres.png", plot = p, width = 6, units = "in")

## Outliers for the above graph
# The first layer of the built plot carries the outlier points of each box.
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##


## Glide epenthesis: is it acceptable?

# Mean and median rating per variant for the two "I/you (m) ..." processes.
df %>%
  filter(process %in% c("I/you (m) opened", "I/you (m) put")) %>%
  group_by(varID) %>%
  summarize(meanRating = mean(rating), medianRating = median(rating))


# Boxplot of the same ratings. The plot object is built once and reused for
# display, for the saved figure and for the outlier listing.
p <- df %>%
  filter(process %in% c("I/you (m) opened", "I/you (m) put")) %>%
  ggplot(mapping = aes(x = varID, y = rating)) +
  geom_boxplot() +
  labs(
    x = "Epenthetic form with special subject suffix",
    y = "Rating",
    title = "Glide Epenthesis Ratings"
  ) +
  scale_x_discrete(labels = c("Good" = "CCVGC"))

p

ggsave("boxplot_SpecialSubjectSuffix_highres.png", plot = p, width = 6, units = "in")


## Outliers for the above graph
# The first layer of the built plot carries the outlier points of each box.
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##


## Glide epenthesis: the "GlideEpen" process

# Mean and median rating per variant.
df %>%
  filter(process == "GlideEpen") %>%
  group_by(varID) %>%
  summarize(meanRating = mean(rating), medianRating = median(rating))


# Boxplot of ratings per variant. The plot object is built once and reused
# for display, for the saved figure and for the outlier listing.
p <- df %>%
  filter(process == "GlideEpen") %>%
  ggplot(mapping = aes(x = varID, y = rating)) +
  geom_boxplot() +
  labs(
    x = "Variant",
    y = "Rating",
    title = "Glide Epenthesis with Special Subject Suffix"
  ) +
  scale_x_discrete(labels = c("Bad" = "GCC", "Good" = "GCVC"))

p

ggsave("boxplot_GlideEpenthesis_highres.png", plot = p, width = 6, units = "in")


## Outliers for the above graph
# The first layer of the built plot carries the outlier points of each box.
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##


## Duplexes

# Mean and median rating per variant across the five duplex processes.
df %>%
  filter(process %in% c("you (f.s) opened it/him",
                        "you (m.p) opened it/him",
                        "open (f.s) it/him",
                        "open (m.p) it/him",
                        "they (m.p) opened it/him")) %>%
  group_by(varID) %>%
  summarize(meanRating = mean(rating), medianRating = median(rating))


# Boxplot of ratings per word; the axis labels lower-case the capitalised
# word IDs. The plot object is built once and reused for display, for the
# saved figure and for the outlier listing.
p <- df %>%
  filter(process %in% c("you (f.s) opened it/him",
                        "you (m.p) opened it/him",
                        "open (f.s) it/him",
                        "open (m.p) it/him",
                        "they (m.p) opened it/him")) %>%
  ggplot(mapping = aes(x = WordID, y = rating)) +
  geom_boxplot() +
  labs(
    x = "Word",
    y = "Rating",
    title = "Duplexes"
  ) +
  scale_x_discrete(
    labels = c(
      "fakkejtiih" = "fakkejtiih",
      "fakkejtuuh" = "fakkejtuuh",
      "Fikkiih" = "fikkiih",
      "Fikkuuh" = "fikkuuh",
      "Fakkooh" = "fakkooh"
    )
  )

p

ggsave("boxplot_Duplexes_highres.png", plot = p, width = 6, units = "in")

## Outliers for the above graph
# The first layer of the built plot carries the outlier points of each box.
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##



## Pronominal Optional Processes

# Mean and median rating per process and variant for processes 1A-1E.
df %>%
  filter(process %in% c("1A", "1B", "1C", "1D", "1E")) %>%
  group_by(process, varID) %>%
  summarize(meanRating = mean(rating), medianRating = median(rating))


# Display labels for the five processes (not referenced by the code below).
useful.labs <- c("(1a)", "(1b)", "(1c)", "(1d)", "(1e)")

# Boxplot of ratings per variant, one free-scaled panel per process. The
# process codes are relabelled for the panel strips ("1A" -> "(1a)", ...,
# "1E" -> "(1e)") by lower-casing and wrapping in parentheses. The plot
# object is built once and reused for display, for the saved figure and for
# the outlier listing.
p <- df %>%
  filter(process %in% c("1A", "1B", "1C", "1D", "1E")) %>%
  mutate(process = paste0("(", tolower(process), ")")) %>%
  ggplot(mapping = aes(x = varID, y = rating)) +
  geom_boxplot() +
  labs(
    x = "Variant",
    y = "Rating",
    title = "Ratings for Pronominal Optional Processes"
  ) +
  facet_wrap(~process, scales = "free") +
  scale_x_discrete(
    labels = c(
      "Faithful" = "Faithful",
      "Geminated" = "Gemination",
      "Vdeletion" = "V-Del."
    )
  )

p

ggsave("boxplot_Pronominal_highres.png", plot = p, width = 6, units = "in")

## Outliers for the above graph
# The first layer of the built plot carries the outlier points of each box.
p.build <- ggplot_build(p)

p.build$data[[1]]$outliers
##