## Generated on: November 23, 2015 - 15:08:23
# load required packages
library(lme4)
library(Hmisc)
library(ggplot2)
# custom plotting functions
download.file('http://www.let.rug.nl/wieling/uhum/plotting.R','plotting.R')
source('plotting.R')
# version information
R.version.string
## [1] "R version 3.2.2 (2015-08-14)"
packageVersion('lme4')
## [1] '1.1.9'
packageVersion('Hmisc')
## [1] '3.17.0'
packageVersion('ggplot2')
## [1] '1.0.1'
For each dataset, we first visualize the relationship between UM and UH by dividing the speakers in four (about) equally sized groups on the basis of their year of birth (not for the U.S. Twitter data, as age information is not available). We further subdivide these four groups on the basis of the gender of the speakers. We then visualize the relationship between age and gender on the use of UM (as proportion of the total use of UH and UM), and if present, the relationship between year of recording and gender on the use of UM. If the number of words per speaker is available in the dataset, we also separately visualize the relationship between age and gender and the use of UM (as proportion of all words pronounced by the speaker) and UH (as proportion of all words pronounced by the speaker). In that case, we also visualize the relationship between age and gender and the use of any of the two hesitation markers (as proportion of all words pronounced by the speaker).
In the second step, we fit a logistic mixed-effects regression model predicting the occurrence of UM as opposed to UH. Consequently, in interpreting the model summaries below, positive estimates indicate an increased probability of using UM as opposed to UH, while negative estimates indicate the opposite. For each model the optimal random- and fixed-effects structure was determined via AIC comparisons. Only the final model for each data set is shown.
Finally, for the datasets for which the number of words per speaker was available, we fitted the following three logistic mixed-effects regression models: (1) predicting the occurrence of UM (as proportion of the total number of words), (2) predicting the occurrence of UH (as proportion of the total number of words), and (3) predicting the occurrecne of either UH or UM (as proportion of the total number of words). For the U.S. Twitter dataset we also show the geographical pattern.
download.file('http://www.let.rug.nl/wieling/uhum/English/Switchboard/sb.rda','sb.rda')
load('sb.rda')
dim(sb)
## [1] 91001 35
str(sb)
## 'data.frame': 91001 obs. of 35 variables:
## $ CallerID : Factor w/ 520 levels "1000","1001",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 37 37 37 37 37 37 37 37 37 37 ...
## $ Sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ Call : Factor w/ 4844 levels "2001a","2001b",..: 231 25 95 301 63 231 335 131 25 95 ...
## $ CallID : Factor w/ 2438 levels "2001","2005",..: 116 13 48 151 32 116 168 66 13 48 ...
## $ WordCount : int 748 561 397 491 400 748 673 477 561 397 ...
## $ CallPos : int 327 11 250 601 5 581 361 281 412 211 ...
## $ PhrasePos : int 1 2 2 2 1 1 1 1 1 2 ...
## $ BPhrasePos : int 2 1 8 1 10 5 1 1 7 2 ...
## $ PLength : int 2 2 9 2 10 5 1 1 7 3 ...
## $ FirstInPhrase: num 1 0 0 0 1 1 1 1 1 0 ...
## $ LastInPhrase : num 0 1 0 1 0 0 1 1 0 0 ...
## $ Duration : num 0.493 0.433 0.316 0.28 0.318 ...
## $ Word : Factor w/ 2 levels "uh","um": 2 2 1 1 2 1 2 2 1 1 ...
## $ Calls : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Region : Factor w/ 9 levels "MIXED","NEW_ENGLAND",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ NextDuration : num 0.0924 0.4256 0.9936 0.5747 0.08 ...
## $ NextWord : Factor w/ 5404 levels "-[US]SR","-[company]'s",..: 1129 1130 5382 1130 2898 2898 1130 1130 5265 3610 ...
## $ PrevDuration : num 0.46 0.309 0.292 0.26 0.388 ...
## $ PrevWord : Factor w/ 3578 levels "-[a]head","-[a]nd",..: 496 607 607 833 496 496 496 496 496 833 ...
## $ PrevSilence : num 0.46 0 0 0 0.388 ...
## $ NextSilence : num 0 0.426 0 0.575 0 ...
## $ UM : num 1 1 0 0 1 0 1 1 0 0 ...
## $ nWords : int 5236 5236 5236 5236 5236 5236 5236 5236 5236 5236 ...
## $ nUM : num 16 16 16 16 16 16 16 16 16 16 ...
## $ nUH : num 30 30 30 30 30 30 30 30 30 30 ...
## $ Age.z : num [1:91001, 1] 0.0232 0.0232 0.0232 0.0232 0.0232 ...
## $ PhrasePos.z : num -0.675 -0.47 -0.47 -0.47 -0.675 ...
## $ BPhrasePos.z : num -0.421 -0.599 0.645 -0.599 1 ...
## $ Duration.z : num 0.999 0.617 -0.121 -0.351 -0.109 ...
## $ NextSilence.z: num -0.861 0.763 -0.861 1.332 -0.861 ...
## $ PrevSilence.z: num 1.225 -0.641 -0.641 -0.641 0.934 ...
## $ CallPos.z : num -0.319 -1.22 -0.539 0.461 -1.237 ...
## $ PLength.z : num -0.775 -0.775 0.181 -0.775 0.317 ...
## $ nWords.z : num [1:91001, 1] -0.187 -0.187 -0.187 -0.187 -0.187 ...
colnames(sb)
## [1] "CallerID" "Age" "Sex" "Call"
## [5] "CallID" "WordCount" "CallPos" "PhrasePos"
## [9] "BPhrasePos" "PLength" "FirstInPhrase" "LastInPhrase"
## [13] "Duration" "Word" "Calls" "Region"
## [17] "NextDuration" "NextWord" "PrevDuration" "PrevWord"
## [21] "PrevSilence" "NextSilence" "UM" "nWords"
## [25] "nUM" "nUH" "Age.z" "PhrasePos.z"
## [29] "BPhrasePos.z" "Duration.z" "NextSilence.z" "PrevSilence.z"
## [33] "CallPos.z" "PLength.z" "nWords.z"
sb$BirthYear = 1990 - sb$Age
s1 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard")
s2 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard","umwords")
s3 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard","uhwords")
s4 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard","uhmwords")
multiplot(s1,s2,s4,s3,cols=2)
# the model is not fitted here but loaded, as it takes more than one hour to fit
m <- glmer(UM ~ Age.z + Sex + FirstInPhrase + Age.z*LastInPhrase + Duration.z + PrevSilence.z +
Age.z*NextSilence.z + (1+Sex|CallID) + (0+Age.z|CallID) +
(1+NextSilence.z+PrevSilence.z+Duration.z+FirstInPhrase+LastInPhrase|CallerID),
data=sb, family='binomial',
control=glmerControl(optCtrl=list(maxfun=1e5),optimizer="bobyqa"))
save(m, file = 'm-sb.rda')
download.file('http://www.let.rug.nl/wieling/uhum/English/Switchboard/m-sb.rda','m-sb.rda')
load('m-sb.rda')
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## UM ~ Age.z + Sex + FirstInPhrase + LastInPhrase * Age.z + Duration.z +
## PrevSilence.z + NextSilence.z * Age.z + (1 + NextSilence.z +
## PrevSilence.z + Duration.z + FirstInPhrase + LastInPhrase |
## CallerID) + (1 + Sex | CallID) + (0 + Age.z | CallID)
## Data: sb
## Control:
## glmerControl(optCtrl = list(maxfun = 1e+05), optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 63207.3 63537.0 -31568.7 63137.3 90966
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -94.793 -0.364 -0.168 -0.038 19.234
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## CallID Age.z 0.12008 0.3465
## CallID.1 (Intercept) 0.56653 0.7527
## SexM 0.46642 0.6829 -0.51
## CallerID (Intercept) 2.22923 1.4931
## NextSilence.z 0.02800 0.1673 -0.23
## PrevSilence.z 0.03641 0.1908 -0.29 0.53
## Duration.z 0.22828 0.4778 0.65 -0.19 -0.11
## FirstInPhrase 0.48557 0.6968 -0.62 -0.02 0.13 -0.50
## LastInPhrase 0.23523 0.4850 -0.50 0.08 0.13 -0.47 0.31
## Number of obs: 91001, groups: CallID, 2438; CallerID, 520
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.04340 0.10054 -20.324 < 2e-16 ***
## Age.z -0.65581 0.06197 -10.583 < 2e-16 ***
## SexM -1.03184 0.11481 -8.988 < 2e-16 ***
## FirstInPhrase 0.67146 0.05402 12.430 < 2e-16 ***
## LastInPhrase 1.05908 0.04514 23.461 < 2e-16 ***
## Duration.z 0.87064 0.02775 31.374 < 2e-16 ***
## PrevSilence.z 0.11561 0.02131 5.424 5.83e-08 ***
## NextSilence.z 0.11295 0.01870 6.041 1.53e-09 ***
## Age.z:LastInPhrase 0.09569 0.04529 2.113 0.03460 *
## Age.z:NextSilence.z 0.05878 0.01938 3.032 0.00243 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Age.z SexM FrstIP LstInP Drtn.z PrvSl. NxtSl. A.:LIP
## Age.z -0.083
## SexM -0.624 0.147
## FirstInPhrs -0.434 0.005 0.044
## LastInPhras -0.404 -0.007 0.013 0.122
## Duration.z 0.378 -0.012 -0.038 -0.294 -0.251
## PrevSilnc.z 0.026 -0.001 -0.012 -0.508 0.050 -0.057
## NextSilnc.z 0.048 -0.024 -0.002 0.022 -0.476 -0.064 0.097
## Ag.z:LstInP -0.009 -0.475 0.007 -0.004 -0.007 0.003 0.003 0.036
## Ag.z:NxtSl. -0.012 0.151 0.001 0.002 0.030 0.015 -0.005 -0.067 -0.536
# model performance
somers2(fitted(m), sb$UM)
## C Dxy n Missing
## 9.184639e-01 8.369279e-01 9.100100e+04 0.000000e+00
Older people and men are relatively less likely to use UM than younger people and women. When the hesitation marker’s duration is longer, when it is preceded or followed by a longer silence, or when it is the first or the last word in the phrase (i.e. if it is preceded or followed by a pause) it is more likely that the hesitation marker is equal to UM. The duration of the following pause and if the hesitation marker is the final word in the phrase are more predictive for the occurrence of UM than for younger people. This suggests that younger people are using UM also more when it is not the final word in the sentence and to mark shorter pauses.
sb2 = unique(sb[,c("CallerID","Age","Age.z","Sex","nWords","nUM","nUH")])
dim(sb2)
## [1] 520 7
# average proportion UM as opposed to UH per speaker
mean(sb2$nUM / (sb2$nUM + sb2$nUH), na.rm=T)
## [1] 0.2825083
# percentage UM of all words
mean(sb2$nUM / sb2$nWords)
## [1] 0.007514793
# percentage UH of all words
mean(sb2$nUH / sb2$nWords)
## [1] 0.02207552
# Older people and men are relatively less likely to use UM
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1|CallerID), data=sb2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1 | CallerID)
## Data: sb2
##
## AIC BIC logLik deviance df.resid
## 4572.1 4589.1 -2282.1 4564.1 516
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.54416 -0.19297 0.01641 0.11739 0.86094
##
## Random effects:
## Groups Name Variance Std.Dev.
## CallerID (Intercept) 0.8562 0.9253
## Number of obs: 520, groups: CallerID, 520
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.06100 0.06198 -81.66 < 2e-16 ***
## SexM -0.48963 0.08561 -5.72 1.07e-08 ***
## Age.z -0.33064 0.04299 -7.69 1.46e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexM
## SexM -0.725
## Age.z -0.059 0.091
# Older people and men are relatively more likely to use UH
# (the age effect is stronger for women than for men)
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1|CallerID), data=sb2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1 | CallerID)
## Data: sb2
##
## AIC BIC logLik deviance df.resid
## 5413.9 5435.2 -2701.9 5403.9 515
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.42459 -0.14327 0.00744 0.09780 1.02197
##
## Random effects:
## Groups Name Variance Std.Dev.
## CallerID (Intercept) 0.3643 0.6036
## Number of obs: 520, groups: CallerID, 520
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.47501 0.04107 -108.97 < 2e-16 ***
## SexM 0.79822 0.05536 14.42 < 2e-16 ***
## SexF:Age.z 0.31207 0.03986 7.83 4.89e-15 ***
## SexM:Age.z 0.20290 0.03823 5.31 1.11e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexM SxF:A.
## SexM -0.741
## SexF:Age.z -0.130 0.096
## SexM:Age.z 0.000 0.044 0.000
# Older people and men are relatively more likely to use either UH or UM
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1|CallerID), data=sb2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1 | CallerID)
## Data: sb2
##
## AIC BIC logLik deviance df.resid
## 5657.0 5674.0 -2824.5 5649.0 516
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.69908 -0.09884 0.00858 0.10785 0.91325
##
## Random effects:
## Groups Name Variance Std.Dev.
## CallerID (Intercept) 0.2731 0.5226
## Number of obs: 520, groups: CallerID, 520
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.89140 0.03494 -111.37 < 2e-16 ***
## SexM 0.46459 0.04754 9.77 < 2e-16 ***
## Age.z 0.10218 0.02373 4.31 1.66e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexM
## SexM -0.737
## Age.z -0.076 0.090
download.file('http://www.let.rug.nl/wieling/uhum/English/Fisher/fisher.rda','fisher.rda')
load('fisher.rda')
dim(fisher)
## [1] 19753 16
str(fisher)
## 'data.frame': 19753 obs. of 16 variables:
## $ SpeakerID : Factor w/ 10313 levels "1006","1007",..: 2517 2517 2518 2518 2518 2519 2520 2520 2520 1 ...
## $ Call : Factor w/ 19753 levels "00001_A","00001_B",..: 8401 8514 13335 13471 13638 5429 18105 18383 18438 651 ...
## $ CallID : Factor w/ 11355 levels "00001","00002",..: 4740 4804 7527 7607 7703 3048 10311 10493 10527 391 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 2 1 ...
## $ Age : int 18 18 31 31 31 33 38 38 38 19 ...
## $ Edu : int 11 11 18 18 18 16 12 12 12 13 ...
## $ Loc : Factor w/ 93 levels "Afghanistan",..: 19 19 91 91 91 22 70 70 70 70 ...
## $ nWordsPerCall : int 739 644 689 802 516 1242 970 1296 1050 416 ...
## $ nUMPerCall : int 2 5 16 14 12 15 8 9 8 10 ...
## $ nUHPerCall : int 0 1 0 0 1 0 2 16 31 0 ...
## $ nWords : int 1383 1383 2007 2007 2007 1242 3316 3316 3316 416 ...
## $ nUM : int 7 7 42 42 42 15 25 25 25 10 ...
## $ nUH : int 1 1 1 1 1 0 49 49 49 0 ...
## $ Age.z : num -1.405 -1.405 -0.399 -0.399 -0.399 ...
## $ Edu.z : num -1.33 -1.33 1.11 1.11 1.11 ...
## $ nWordsPerCall.z: num -0.683 -1.004 -0.852 -0.471 -1.436 ...
colnames(fisher)
## [1] "SpeakerID" "Call" "CallID"
## [4] "Sex" "Age" "Edu"
## [7] "Loc" "nWordsPerCall" "nUMPerCall"
## [10] "nUHPerCall" "nWords" "nUM"
## [13] "nUH" "Age.z" "Edu.z"
## [16] "nWordsPerCall.z"
fisher$BirthYear = 2002 - fisher$Age
fi1 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","umuh","nUM")
fi2 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","umwords","nUM")
fi3 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","uhwords","nUM")
fi4 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","uhmwords","nUM")
multiplot(fi1,fi2,fi4,fi3,cols=2)
# takes about 5 minutes
m <- glmer(cbind(nUMPerCall,nUHPerCall) ~ Sex + Age.z + Edu.z + nWordsPerCall.z + (1|SpeakerID) + (0+Age.z|CallID) +
(1|CallID), data=fisher, family='binomial')
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.0017274 (tol =
## 0.001, component 1)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUMPerCall, nUHPerCall) ~ Sex + Age.z + Edu.z + nWordsPerCall.z +
## (1 | SpeakerID) + (0 + Age.z | CallID) + (1 | CallID)
## Data: fisher
##
## AIC BIC logLik deviance df.resid
## 84210.3 84273.4 -42097.1 84194.3 19745
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2889 -0.2048 0.0729 0.4642 2.2729
##
## Random effects:
## Groups Name Variance Std.Dev.
## CallID (Intercept) 3.58793 1.8942
## CallID.1 Age.z 0.07864 0.2804
## SpeakerID (Intercept) 1.12218 1.0593
## Number of obs: 19753, groups: CallID, 11355; SpeakerID, 10313
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.90785 0.02942 64.84 < 2e-16 ***
## SexM -1.36907 0.03696 -37.04 < 2e-16 ***
## Age.z -0.39052 0.01718 -22.74 < 2e-16 ***
## Edu.z 0.10539 0.01639 6.43 1.27e-10 ***
## nWordsPerCall.z 0.08612 0.01347 6.39 1.63e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexM Age.z Edu.z
## SexM -0.552
## Age.z -0.094 0.129
## Edu.z 0.029 -0.073 -0.077
## nWrdsPrCll. 0.003 -0.058 -0.144 -0.024
## convergence code: 0
## Model failed to converge with max|grad| = 0.0017274 (tol = 0.001, component 1)
# model performance
somers2( fitted(m), (fisher$nUMPerCall > fisher$nUHPerCall)*1 )
## C Dxy n Missing
## 9.643999e-01 9.287997e-01 1.975300e+04 0.000000e+00
somers2( fitted(m), fisher$nUMPerCall / (fisher$nUMPerCall+fisher$nUHPerCall) )
## C Dxy n Missing
## 9.768619e-01 9.537238e-01 1.910100e+04 6.520000e+02
Older people and men are relatively less likely to use UM than younger people and women. People who received more education are more likely to use UM than people who received less education. Finally, the more words people are pronouncing in a call, the more likely they are to use UM as opposed to UH.
fisher2 = unique(fisher[,c("SpeakerID","Age","Age.z","Edu","Edu.z","Sex","nWords","nUM","nUH")])
dim(fisher2)
## [1] 10313 9
# average proportion UM as opposed to UH per speaker
mean(fisher2$nUM / (fisher2$nUM + fisher2$nUH), na.rm=T)
## [1] 0.6407968
# percentage UM of all words
mean(fisher2$nUM / fisher2$nWords)
## [1] 0.009861317
# percentage UH of all words
mean(fisher2$nUH / fisher2$nWords)
## [1] 0.006765215
# Older people, lower educated people, and men are relatively less likely to use UM
# (the age and education effects are strongest for men; the education effect is stronger for older people)
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Sex + Sex:Age.z + Sex:Edu.z + Age.z:Edu.z+ (1|SpeakerID), data=fisher2,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUM, nWords - nUM) ~ Sex + Sex:Age.z + Sex:Edu.z + Age.z:Edu.z +
## (1 | SpeakerID)
## Data: fisher2
##
## AIC BIC logLik deviance df.resid
## 76813.0 76870.9 -38398.5 76797.0 10305
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.76962 -0.28271 0.02718 0.18178 1.10784
##
## Random effects:
## Groups Name Variance Std.Dev.
## SpeakerID (Intercept) 0.6849 0.8276
## Number of obs: 10313, groups: SpeakerID, 10313
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.791767 0.011466 -417.9 < 2e-16 ***
## SexM -0.352057 0.018350 -19.2 < 2e-16 ***
## SexF:Age.z -0.065989 0.011796 -5.6 2.22e-08 ***
## SexM:Age.z -0.137460 0.014076 -9.8 < 2e-16 ***
## SexF:Edu.z 0.056993 0.011470 5.0 6.74e-07 ***
## SexM:Edu.z 0.110022 0.014525 7.6 3.60e-14 ***
## Age.z:Edu.z 0.042092 0.008816 4.8 1.80e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexM SxF:A. SxM:A. SxF:E. SxM:E.
## SexM -0.613
## SexF:Age.z -0.108 0.060
## SexM:Age.z 0.002 0.135 0.000
## SexF:Edu.z 0.043 -0.013 -0.028 -0.001
## SexM:Edu.z -0.003 -0.100 0.007 -0.178 -0.012
## Age.z:Edu.z -0.022 -0.069 0.102 0.003 -0.187 0.066
# Older people and men are relatively more likely to use UH than younger people and wormen
# (the age effect is strongest for women)
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + Edu.z + (1|SpeakerID), data=fisher2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + Edu.z + (1 | SpeakerID)
## Data: fisher2
##
## AIC BIC logLik deviance df.resid
## 68396.7 68440.2 -34192.4 68384.7 10307
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.37056 -0.39042 0.03815 0.15388 1.34505
##
## Random effects:
## Groups Name Variance Std.Dev.
## SpeakerID (Intercept) 1.538 1.24
## Number of obs: 10313, groups: SpeakerID, 10313
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.16368 0.01871 -329.5 <2e-16 ***
## SexM 0.98717 0.02762 35.7 <2e-16 ***
## Edu.z -0.02255 0.01364 -1.7 0.0983 .
## SexF:Age.z 0.34914 0.01828 19.1 <2e-16 ***
## SexM:Age.z 0.20889 0.01994 10.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexM Edu.z SxF:A.
## SexM -0.649
## Edu.z 0.038 -0.077
## SexF:Age.z -0.169 0.110 0.006
## SexM:Age.z -0.010 0.099 -0.112 0.000
# Men, older people and higher educated people are relatively more likely to use either UH or UM
# (the education effect is strongest for older people)
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z*Edu.z + (1|SpeakerID), data=fisher2,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z * Edu.z +
## (1 | SpeakerID)
## Data: fisher2
##
## AIC BIC logLik deviance df.resid
## 85657.2 85700.6 -42822.6 85645.2 10307
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.06427 -0.22205 0.02361 0.16397 0.85843
##
## Random effects:
## Groups Name Variance Std.Dev.
## SpeakerID (Intercept) 0.5099 0.7141
## Number of obs: 10313, groups: SpeakerID, 10313
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.404077 0.009791 -449.8 < 2e-16 ***
## SexM 0.209628 0.015382 13.6 < 2e-16 ***
## Age.z 0.063810 0.007561 8.4 < 2e-16 ***
## Edu.z 0.031664 0.007594 4.2 3.05e-05 ***
## Age.z:Edu.z 0.026231 0.007263 3.6 0.000304 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SexM Age.z Edu.z
## SexM -0.639
## Age.z -0.094 0.124
## Edu.z 0.043 -0.067 -0.086
## Age.z:Edu.z -0.012 -0.058 0.099 -0.123
download.file('http://www.let.rug.nl/wieling/uhum/English/PNC/pnc.rda','pnc.rda')
load('pnc.rda')
dim(pnc)
## [1] 25514 39
str(pnc)
## 'data.frame': 25514 obs. of 39 variables:
## $ idstring : Factor w/ 395 levels "PH00-1-1-","PH00-1-2-",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ birthyear : int 1979 1979 1979 1979 1979 1979 1979 1979 1979 1979 ...
## $ age : int 21 21 21 21 21 21 21 21 21 21 ...
## $ sex : Factor w/ 2 levels "f","m": 2 2 2 2 2 2 2 2 2 2 ...
## $ ethnicity : Factor w/ 36 levels "","a","g","g/i",..: 13 13 13 13 13 13 13 13 13 13 ...
## $ schooling : num 14 14 14 14 14 14 14 14 14 14 ...
## $ recyear : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ word : Factor w/ 4 levels "AND_UH","AND_UM",..: 3 3 4 3 3 3 3 3 4 3 ...
## $ start_time : num 24.4 35 37.9 44.5 57.6 ...
## $ end_time : num 24.7 35.2 38.3 44.7 57.8 ...
## $ vowel_start : num 24.4 35 37.9 44.5 57.6 ...
## $ vowel_end : num 24.7 35.2 38.1 44.7 57.8 ...
## $ nasal_start : num NA NA 38.1 NA NA ...
## $ nasal_end : num NA NA 38.3 NA NA ...
## $ next_seg : Factor w/ 49 levels "AA1","AA2","AE0",..: 42 22 44 17 11 44 44 44 44 44 ...
## $ next_seg_start : num 24.7 35.2 38.3 44.7 57.8 ...
## $ next_seg_end : num 24.9 35.4 38.4 44.7 57.9 ...
## $ next_silence_length : num 0 0 0.12 0 0 ...
## $ next_silence_length.log : num -5.3 -5.3 -2.08 -5.3 -5.3 ...
## $ chunk_start : num 24.4 35 37.9 44.5 57.6 ...
## $ chunk_end : num 25.3 37.1 38.8 45.6 59 ...
## $ transcribed : num 2811 2811 2811 2811 2811 ...
## $ total : num 2814 2814 2814 2814 2814 ...
## $ nvowels : int 3078 3078 3078 3078 3078 3078 3078 3078 3078 3078 ...
## $ duration : num 0.3 0.28 0.37 0.2 0.18 ...
## $ pausebefore : num 1 1 1 1 1 0 0 0 1 0 ...
## $ pauseafter : num 0 0 0 0 0 1 1 1 1 1 ...
## $ UM : num 0 0 1 0 0 0 0 0 1 0 ...
## $ nwords : int 6551 6551 6551 6551 6551 6551 6551 6551 6551 6551 ...
## $ nUM : num 60 60 60 60 60 60 60 60 60 60 ...
## $ nUH : num 90 90 90 90 90 90 90 90 90 90 ...
## $ birthyear.z : num 1.4 1.4 1.4 1.4 1.4 ...
## $ age.z : num -0.949 -0.949 -0.949 -0.949 -0.949 ...
## $ schooling.z : num 0.697 0.697 0.697 0.697 0.697 ...
## $ recyear.z : num 1.07 1.07 1.07 1.07 1.07 ...
## $ next_silence_length.z : num -0.0361 -0.0361 -0.0324 -0.0361 -0.0361 ...
## $ next_silence_length.log.z: num -1.4297 -1.4297 -0.0274 -1.4297 -1.4297 ...
## $ duration.z : num 0.265 0.14 0.705 -0.363 -0.488 ...
## $ nwords.z : num 1.13 1.13 1.13 1.13 1.13 ...
colnames(pnc)
## [1] "idstring" "birthyear"
## [3] "age" "sex"
## [5] "ethnicity" "schooling"
## [7] "recyear" "word"
## [9] "start_time" "end_time"
## [11] "vowel_start" "vowel_end"
## [13] "nasal_start" "nasal_end"
## [15] "next_seg" "next_seg_start"
## [17] "next_seg_end" "next_silence_length"
## [19] "next_silence_length.log" "chunk_start"
## [21] "chunk_end" "transcribed"
## [23] "total" "nvowels"
## [25] "duration" "pausebefore"
## [27] "pauseafter" "UM"
## [29] "nwords" "nUM"
## [31] "nUH" "birthyear.z"
## [33] "age.z" "schooling.z"
## [35] "recyear.z" "next_silence_length.z"
## [37] "next_silence_length.log.z" "duration.z"
## [39] "nwords.z"
p1 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC")
p2 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC","umwords",nwordVar='nwords')
p3 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC","uhwords",nwordVar='nwords')
p4 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC","uhmwords",nwordVar='nwords')
multiplot(p1,p2,p4,p3,cols=2)
n5 = visualizeUHUM(pnc,"idstring","recyear","sex","PNC")
n5
# takes a few minutes
m <- glmer(UM ~ sex * age.z + recyear.z + duration.z + pauseafter + age.z*next_silence_length.log.z +
(1+duration.z+pauseafter+next_silence_length.log.z|idstring),
family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa"))
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: UM ~ sex * age.z + recyear.z + duration.z + pauseafter + age.z *
## next_silence_length.log.z + (1 + duration.z + pauseafter +
## next_silence_length.log.z | idstring)
## Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 14197.7 14352.5 -7079.9 14159.7 25495
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -18.1707 -0.2717 -0.1085 0.0162 31.3339
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## idstring (Intercept) 1.8945 1.3764
## duration.z 0.5533 0.7438 0.48
## pauseafter 0.2606 0.5105 0.12 0.11
## next_silence_length.log.z 0.2260 0.4753 -0.14 -0.12 -0.10
## Number of obs: 25514, groups: idstring, 395
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.75359 0.12130 -14.456 < 2e-16 ***
## sexm -1.31029 0.18017 -7.272 3.53e-13 ***
## age.z -1.16971 0.10428 -11.217 < 2e-16 ***
## recyear.z 0.54344 0.08132 6.682 2.35e-11 ***
## duration.z 1.25023 0.05590 22.364 < 2e-16 ***
## pauseafter 0.59255 0.10983 5.395 6.85e-08 ***
## next_silence_length.log.z 0.54594 0.06422 8.501 < 2e-16 ***
## sexm:age.z -0.51507 0.17604 -2.926 0.003435 **
## age.z:next_silence_length.log.z 0.17922 0.04760 3.765 0.000166 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm age.z rcyr.z drtn.z pasftr nx__.. sxm:g.
## sexm -0.510
## age.z 0.033 0.040
## recyear.z 0.000 -0.065 -0.141
## duration.z 0.206 -0.104 -0.066 0.042
## pauseafter -0.415 -0.101 -0.020 0.005 0.023
## nxt_slnc_.. 0.203 0.025 -0.033 0.008 -0.045 -0.660
## sexm:age.z -0.013 0.110 -0.553 -0.022 -0.023 -0.001 -0.031
## ag.z:nx__.. -0.061 -0.016 -0.082 -0.007 0.011 -0.005 0.179 -0.046
# model performance
somers2(fitted(m), pnc$UM)
## C Dxy n Missing
## 9.497833e-01 8.995665e-01 2.551400e+04 0.000000e+00
Older people and men are relatively less likely to use UM than younger people and women (the age effect is strongest for men, as the interaction shows). People recorded in more recent years are more likely to use UM than people recorded earlier. When the hesitation marker’s duration is longer, when it is followed by a pause (of at least 200 ms.), and when the following pause is longer (especially so for older people), it is more likely that the hesitation marker is equal to UM. The presence of a pause before the hesitation marker does not significantly distinguish UM from UH (see below).
The following (slightly better fitting) model uses year of birth and age, rather than year of recording and age and shows that the age effect is only limited, as year of birth is the most important predictor. This means that the year of birth is the determining factor for the amount of UM people use, as opposed to their age. Furthermore, the predictive power of the duration of the silence following the hesitation marker is diminishing for people born later. This suggests that people born later are using UM more in general, so not only denoting longer pauses.
# takes a few minutes
m2 <- glmer(UM ~ sex + sex : birthyear.z + age.z + duration.z + pauseafter +
birthyear.z*next_silence_length.log.z +
(1+duration.z+pauseafter+next_silence_length.log.z|idstring),
family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa"))
summary(m2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: UM ~ sex + sex:birthyear.z + age.z + duration.z + pauseafter +
## birthyear.z * next_silence_length.log.z + (1 + duration.z +
## pauseafter + next_silence_length.log.z | idstring)
## Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 14193.0 14347.8 -7077.5 14155.0 25495
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -18.258 -0.272 -0.108 0.017 33.671
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## idstring (Intercept) 1.8419 1.3572
## duration.z 0.5552 0.7451 0.49
## pauseafter 0.2659 0.5157 0.16 0.12
## next_silence_length.log.z 0.2321 0.4818 -0.15 -0.10 -0.14
## Number of obs: 25514, groups: idstring, 395
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.75419 0.12039 -14.571 < 2e-16
## sexm -1.31878 0.17863 -7.383 1.55e-13
## age.z -0.32691 0.15898 -2.056 0.039750
## duration.z 1.24945 0.05594 22.336 < 2e-16
## pauseafter 0.58653 0.10978 5.343 9.16e-08
## birthyear.z 0.88750 0.17478 5.078 3.82e-07
## next_silence_length.log.z 0.55821 0.06476 8.620 < 2e-16
## sexm:birthyear.z 0.64698 0.18000 3.594 0.000325
## birthyear.z:next_silence_length.log.z -0.18944 0.04928 -3.844 0.000121
##
## (Intercept) ***
## sexm ***
## age.z *
## duration.z ***
## pauseafter ***
## birthyear.z ***
## next_silence_length.log.z ***
## sexm:birthyear.z ***
## birthyear.z:next_silence_length.log.z ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm age.z drtn.z pasftr brthy. nx__.. sxm:b.
## sexm -0.509
## age.z 0.009 -0.001
## duration.z 0.211 -0.104 -0.015
## pauseafter -0.411 -0.098 -0.009 0.022
## birthyear.z -0.013 -0.032 0.798 0.030 0.012
## nxt_slnc_.. 0.191 0.025 0.024 -0.034 -0.660 0.037
## sxm:brthyr. 0.014 -0.107 -0.017 0.019 -0.016 -0.349 0.049
## brthy.:__.. 0.075 0.015 0.025 -0.014 0.001 -0.028 -0.205 -0.043
# model performance
somers2(fitted(m2), pnc$UM)
## C Dxy n Missing
## 9.497548e-01 8.995096e-01 2.551400e+04 0.000000e+00
AIC(m) # model on the basis of age and year of recording
## [1] 14197.72
AIC(m2) # model on the basis of year of birth and age (better: lower AIC)
## [1] 14193
pnc2 = unique(pnc[,c("idstring","age","age.z","sex","recyear","recyear.z","nwords","nUM","nUH")])
dim(pnc2)
## [1] 395 9
# average proportion UM as opposed to UH per speaker
mean(pnc2$nUM / (pnc2$nUM + pnc2$nUH), na.rm=T)
## [1] 0.2765317
# percentage UM of all words
mean(pnc2$nUM / pnc2$nwords)
## [1] 0.004543936
# percentage UH of all words
mean(pnc2$nUH / pnc2$nwords)
## [1] 0.01321371
# Older people, people who were recorded earlier, and men are relatively less likely to use UM
summary(mUM <- glmer(cbind(nUM, nwords - nUM) ~ sex + age.z + recyear.z + (1|idstring), data=pnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUM, nwords - nUM) ~ sex + age.z + recyear.z + (1 | idstring)
## Data: pnc2
##
## AIC BIC logLik deviance df.resid
## 2680.0 2699.9 -1335.0 2670.0 390
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.33970 -0.33877 0.01887 0.14231 0.91321
##
## Random effects:
## Groups Name Variance Std.Dev.
## idstring (Intercept) 1.326 1.151
## Number of obs: 395, groups: idstring, 395
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.80473 0.08402 -69.09 < 2e-16 ***
## sexm -1.04225 0.13214 -7.89 3.08e-15 ***
## age.z -0.69226 0.06650 -10.41 < 2e-16 ***
## recyear.z 0.39701 0.06384 6.22 5.01e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm age.z
## sexm -0.605
## age.z 0.062 0.044
## recyear.z -0.039 -0.022 -0.131
# Younger people, people who were more recently recorded, and women are less likely to use UH
summary(mUH <- glmer(cbind(nUH, nwords - nUH) ~ sex + age.z + recyear.z + (1|idstring), data=pnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH, nwords - nUH) ~ sex + age.z + recyear.z + (1 | idstring)
## Data: pnc2
##
## AIC BIC logLik deviance df.resid
## 3512.8 3532.7 -1751.4 3502.8 390
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.52779 -0.20029 -0.00452 0.13236 0.90533
##
## Random effects:
## Groups Name Variance Std.Dev.
## idstring (Intercept) 0.5462 0.739
## Number of obs: 395, groups: idstring, 395
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.86681 0.05270 -92.34 < 2e-16 ***
## sexm 0.40644 0.07870 5.16 2.41e-07 ***
## age.z 0.46963 0.03962 11.85 < 2e-16 ***
## recyear.z -0.24388 0.03954 -6.17 6.95e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm age.z
## sexm -0.666
## age.z -0.047 0.010
## recyear.z 0.017 -0.003 -0.109
# Only older people are relatively more likely to use either UH or UM.
summary(mUHUM <- glmer(cbind(nUH + nUM, nwords - nUH - nUM) ~ sex + age.z + recyear.z + (1|idstring), data=pnc2,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH + nUM, nwords - nUH - nUM) ~ sex + age.z + recyear.z +
## (1 | idstring)
## Data: pnc2
##
## AIC BIC logLik deviance df.resid
## 3776.9 3796.8 -1883.4 3766.9 390
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.81074 -0.17015 0.01041 0.11135 0.63540
##
## Random effects:
## Groups Name Variance Std.Dev.
## idstring (Intercept) 0.4546 0.6742
## Number of obs: 395, groups: idstring, 395
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.27471 0.04703 -90.90 < 2e-16 ***
## sexm 0.09042 0.07094 1.27 0.202
## age.z 0.17214 0.03547 4.85 1.21e-06 ***
## recyear.z -0.04180 0.03530 -1.18 0.236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm age.z
## sexm -0.661
## age.z -0.016 -0.001
## recyear.z 0.000 0.001 -0.096
# Effect size of schooling (n.s.)
summary(m0 <- glmer(UM ~ sex + sex : age.z + recyear.z + duration.z + pauseafter + schooling.z
+ age.z*next_silence_length.log.z +
(1+duration.z+pauseafter+next_silence_length.log.z|idstring),
family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: UM ~ sex + sex:age.z + recyear.z + duration.z + pauseafter +
## schooling.z + age.z * next_silence_length.log.z + (1 + duration.z +
## pauseafter + next_silence_length.log.z | idstring)
## Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 13261.3 13422.8 -6610.7 13221.3 23716
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -17.3792 -0.2726 -0.1111 0.0059 30.1706
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## idstring (Intercept) 1.9516 1.3970
## duration.z 0.5718 0.7562 0.48
## pauseafter 0.2564 0.5063 0.06 0.13
## next_silence_length.log.z 0.1984 0.4454 -0.10 -0.08 -0.07
## Number of obs: 23736, groups: idstring, 361
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.75969 0.12737 -13.815 < 2e-16 ***
## sexm -1.35362 0.19247 -7.033 2.02e-12 ***
## recyear.z 0.51473 0.08807 5.845 5.07e-09 ***
## duration.z 1.24921 0.05897 21.185 < 2e-16 ***
## pauseafter 0.59326 0.11546 5.138 2.77e-07 ***
## schooling.z 0.03201 0.08752 0.366 0.7145
## age.z -1.15043 0.11161 -10.308 < 2e-16 ***
## next_silence_length.log.z 0.53945 0.06656 8.104 5.31e-16 ***
## sexm:age.z -0.43060 0.19086 -2.256 0.0241 *
## age.z:next_silence_length.log.z 0.16061 0.04957 3.240 0.0012 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm rcyr.z drtn.z pasftr schln. age.z nx__.. sxm:g.
## sexm -0.506
## recyear.z 0.027 -0.064
## duration.z 0.214 -0.114 0.030
## pauseafter -0.425 -0.099 0.015 0.025
## schooling.z 0.041 -0.126 -0.228 0.041 -0.019
## age.z 0.040 0.052 -0.073 -0.078 -0.015 -0.171
## nxt_slnc_.. 0.216 0.028 -0.017 -0.028 -0.666 0.028 -0.028
## sexm:age.z -0.025 0.099 -0.028 -0.010 0.017 0.051 -0.551 -0.048
## ag.z:nx__.. -0.051 -0.016 -0.030 0.011 -0.015 0.064 -0.086 0.197 -0.045
# Effect size of pause before hesitation marker (n.s.)
summary(m1 <- glmer(UM ~ sex * age.z + recyear.z + duration.z + pauseafter + pausebefore +
age.z*next_silence_length.log.z +
(1+duration.z+pauseafter+pausebefore+next_silence_length.log.z|idstring),
family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## UM ~ sex * age.z + recyear.z + duration.z + pauseafter + pausebefore +
## age.z * next_silence_length.log.z + (1 + duration.z + pauseafter +
## pausebefore + next_silence_length.log.z | idstring)
## Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 14087.0 14290.7 -7018.5 14037.0 25489
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -18.5707 -0.2646 -0.1044 0.0143 27.8927
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## idstring (Intercept) 2.3123 1.5206
## duration.z 0.5647 0.7514 0.53
## pauseafter 0.2338 0.4835 0.13 0.16
## pausebefore 0.8973 0.9473 -0.48 -0.15 -0.13
## next_silence_length.log.z 0.2518 0.5018 -0.12 -0.12 -0.17
##
##
##
##
##
## 0.03
## Number of obs: 25514, groups: idstring, 395
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.76487 0.13072 -13.501 < 2e-16 ***
## sexm -1.34238 0.17812 -7.536 4.84e-14 ***
## age.z -1.14816 0.10460 -10.976 < 2e-16 ***
## recyear.z 0.56672 0.08014 7.072 1.53e-12 ***
## duration.z 1.27442 0.05745 22.184 < 2e-16 ***
## pauseafter 0.61949 0.11095 5.583 2.36e-08 ***
## pausebefore -0.08419 0.08720 -0.965 0.334304
## next_silence_length.log.z 0.55114 0.06577 8.380 < 2e-16 ***
## sexm:age.z -0.49232 0.17459 -2.820 0.004804 **
## age.z:next_silence_length.log.z 0.18753 0.04949 3.789 0.000151 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm age.z rcyr.z drtn.z pasftr pasbfr nx__.. sxm:g.
## sexm -0.475
## age.z -0.003 0.042
## recyear.z -0.003 -0.065 -0.128
## duration.z 0.267 -0.116 -0.088 0.044
## pauseafter -0.404 -0.097 -0.008 0.008 0.028
## pausebefore -0.384 0.045 0.106 -0.013 -0.192 0.015
## nxt_slnc_.. 0.186 0.018 -0.037 0.011 -0.049 -0.663 0.029
## sexm:age.z -0.028 0.136 -0.533 -0.026 -0.031 -0.005 0.019 -0.025
## ag.z:nx__.. -0.061 -0.014 -0.084 -0.011 0.006 -0.003 0.023 0.168 -0.044
m <- glmer(UM ~ sex * age.z + recyear.z + duration.z + pauseafter + age.z*next_silence_length.log.z + (1+duration.z+pauseafter+next_silence_length.log.z|idstring), family=‘binomial’, data=pnc, control=glmerControl(optimizer=“bobyqa”)) summary(m)
download.file('http://www.let.rug.nl/wieling/uhum/English/BNC/bnc.rda','bnc.rda')
load('bnc.rda')
dim(bnc)
## [1] 25498 13
str(bnc)
## 'data.frame': 25498 obs. of 13 variables:
## $ speaker : Factor w/ 960 levels "PS002","PS003",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 60 60 60 60 60 60 60 60 60 60 ...
## $ gender : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 1 1 ...
## $ word : Factor w/ 2 levels "UH","UM": 1 1 1 1 2 1 2 2 2 1 ...
## $ duration : num 0.1 0.16 0.09 0.03 0.06 0.07 0.09 0.06 0.06 0.03 ...
## $ pauseafter : num 0.09 0.22 2.85 0.26 0.32 0.49 0.57 0.22 0.07 0.54 ...
## $ UM : num 0 0 0 0 1 0 1 1 1 0 ...
## $ nWords : int 3295 3295 3295 3295 3295 3295 3295 3295 3295 3295 ...
## $ nUM : num 14 14 14 14 14 14 14 14 14 14 ...
## $ nUH : num 14 14 14 14 14 14 14 14 14 14 ...
## $ age.z : num 1.1 1.1 1.1 1.1 1.1 ...
## $ duration.z : num -0.3961 0.0424 -0.4692 -0.9078 -0.6885 ...
## $ pauseafter.z: num -0.599 -0.482 1.88 -0.446 -0.392 ...
colnames(bnc)
## [1] "speaker" "age" "gender" "word"
## [5] "duration" "pauseafter" "UM" "nWords"
## [9] "nUM" "nUH" "age.z" "duration.z"
## [13] "pauseafter.z"
bnc$birthyear = 1993 - bnc$age
b1 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC")
b2 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC","umwords")
b3 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC","uhwords")
b4 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC","uhmwords")
multiplot(b1,b2,b4,b3,cols=2)
# takes about 80 seconds
m <- glmer(UM ~ age.z + gender + duration.z + pauseafter.z + (0+pauseafter.z|speaker) +
(1+duration.z+pauseafter.z|speaker),
data=bnc,family='binomial', control=glmerControl(optimizer="bobyqa"))
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## UM ~ age.z + gender + duration.z + pauseafter.z + (0 + pauseafter.z |
## speaker) + (1 + duration.z + pauseafter.z | speaker)
## Data: bnc
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 27372.2 27469.9 -13674.1 27348.2 25486
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -41.989 -0.656 -0.134 0.659 6.485
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## speaker pauseafter.z 0.06095 0.2469
## speaker.1 (Intercept) 2.21248 1.4874
## duration.z 0.56188 0.7496 0.75
## pauseafter.z 0.11419 0.3379 0.40 0.16
## Number of obs: 25498, groups: speaker, 960
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.44490 0.08056 5.523 3.34e-08 ***
## age.z -0.45377 0.04788 -9.477 < 2e-16 ***
## genderm -0.44894 0.09658 -4.648 3.35e-06 ***
## duration.z 1.06549 0.04716 22.594 < 2e-16 ***
## pauseafter.z 0.44329 0.03527 12.567 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z gendrm drtn.z
## age.z -0.027
## genderm -0.650 0.035
## duration.z 0.428 0.065 0.037
## pauseaftr.z 0.161 0.017 -0.010 0.019
# model performance
somers2(fitted(m), bnc$UM)
## C Dxy n Missing
## 8.532137e-01 7.064274e-01 2.549800e+04 0.000000e+00
Older people and men are relatively less likely to use UM than younger people and women. When the hesitation marker’s duration is longer, and when it is followed by longer pause it is more likely that the hesitation marker is equal to UM.
bnc2 = unique(bnc[,c("speaker","age","age.z","gender","nWords","nUM","nUH")])
dim(bnc2)
## [1] 960 7
# average proportion UM as opposed to UH per speaker
mean(bnc2$nUM / (bnc2$nUM + bnc2$nUH), na.rm=T)
## [1] 0.4611727
# percentage UM of all words
mean(bnc2$nUM / bnc2$nWords)
## [1] 0.004269795
# percentage UH of all words
mean(bnc2$nUH / bnc2$nWords)
## [1] 0.004511923
# Older people are relatively less likely to use UM
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ gender + age.z + (1|speaker), data=bnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ gender + age.z + (1 | speaker)
## Data: bnc2
##
## AIC BIC logLik deviance df.resid
## 5774.9 5794.4 -2883.4 5766.9 956
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.42764 -0.40072 -0.00744 0.18734 2.30172
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 1.112 1.055
## Number of obs: 960, groups: speaker, 960
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.11742 0.06023 -101.57 < 2e-16 ***
## genderm 0.09470 0.07998 1.18 0.23641
## age.z -0.12043 0.03978 -3.03 0.00247 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) gendrm
## genderm -0.734
## age.z -0.004 0.000
# Men and older people are relatively more likely to use UH
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ gender + age.z + (1|speaker), data=bnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ gender + age.z + (1 | speaker)
## Data: bnc2
##
## AIC BIC logLik deviance df.resid
## 5703.5 5722.9 -2847.7 5695.5 956
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7733 -0.3212 0.0101 0.2556 3.8537
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.6468 0.8042
## Number of obs: 960, groups: speaker, 960
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.20236 0.04888 -126.89 <2e-16 ***
## genderm 0.54884 0.06356 8.64 <2e-16 ***
## age.z 0.27783 0.03134 8.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) gendrm
## genderm -0.758
## age.z -0.073 0.006
# Older people and men are relatively more likely to use either UH or UM
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ gender + age.z + (1|speaker), data=bnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ gender + age.z + (1 |
## speaker)
## Data: bnc2
##
## AIC BIC logLik deviance df.resid
## 6828.5 6848.0 -3410.3 6820.5 956
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4357 -0.2617 -0.0042 0.2063 3.4355
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.5352 0.7316
## Number of obs: 960, groups: speaker, 960
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.30618 0.04092 -129.67 < 2e-16 ***
## genderm 0.35603 0.05412 6.58 4.75e-11 ***
## age.z 0.11544 0.02670 4.32 1.54e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) gendrm
## genderm -0.753
## age.z -0.028 -0.005
download.file('http://www.let.rug.nl/wieling/uhum/English/HCRC/hcrc.rda','hcrc.rda')
load('hcrc.rda')
dim(hcrc)
## [1] 1987 18
str(hcrc)
## 'data.frame': 1987 obs. of 18 variables:
## $ SpeakerID : Factor w/ 64 levels "q1eta1","q1eta2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 20 20 20 20 20 20 20 20 20 20 ...
## $ Sex : Factor w/ 2 levels "f","m": 2 2 2 2 2 2 2 2 2 2 ...
## $ ConversationID : Factor w/ 128 levels "q1ec1","q1ec2",..: 1 1 1 1 6 7 3 7 3 6 ...
## $ Role : Factor w/ 2 levels "f","g": 2 2 2 2 1 2 1 2 1 1 ...
## $ Origin : Factor w/ 22 levels "Aberdeen","Aldershot",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ Word : Factor w/ 7 levels "eh","ehm","er",..: 4 4 4 4 5 3 5 1 6 2 ...
## $ Position : int 8 16 8 9 1 1 1 3 8 1 ...
## $ nWordsInPhrase : int 29 29 18 11 2 17 8 3 13 5 ...
## $ FirstInPhrase : num 0 0 0 0 1 1 1 0 0 1 ...
## $ LastInPhrase : num 0 0 0 0 0 0 0 1 0 0 ...
## $ UM : num 1 1 1 1 1 0 1 0 0 1 ...
## $ nWords : int 2102 2102 2102 2102 2102 2102 2102 2102 2102 2102 ...
## $ nUM : num 7 7 7 7 7 7 7 7 7 7 ...
## $ nUH : num 3 3 3 3 3 3 3 3 3 3 ...
## $ Age.z : num 0.0975 0.0975 0.0975 0.0975 0.0975 ...
## $ Position.z : num 0.602 1.904 0.602 0.765 -0.538 ...
## $ nWordsInPhrase.z: num 1.6615 1.6615 0.6237 -0.0367 -0.8858 ...
colnames(hcrc)
## [1] "SpeakerID" "Age" "Sex"
## [4] "ConversationID" "Role" "Origin"
## [7] "Word" "Position" "nWordsInPhrase"
## [10] "FirstInPhrase" "LastInPhrase" "UM"
## [13] "nWords" "nUM" "nUH"
## [16] "Age.z" "Position.z" "nWordsInPhrase.z"
hcrc$BirthYear = 1990 - hcrc$Age
h1 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC")
h2 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC","umwords")
h3 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC","uhwords")
h4 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC","uhmwords")
multiplot(h1,h2,h4,h3,cols=2)
# takes about 1 second
m <- glmer(UM ~ Age.z + Sex + FirstInPhrase + LastInPhrase + (1+FirstInPhrase|SpeakerID) + (1|ConversationID),
data=hcrc, family='binomial', control=glmerControl(optimizer="bobyqa"))
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## UM ~ Age.z + Sex + FirstInPhrase + LastInPhrase + (1 + FirstInPhrase |
## SpeakerID) + (1 | ConversationID)
## Data: hcrc
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 2008.2 2058.6 -995.1 1990.2 1978
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2692 -0.5310 0.2290 0.5218 4.1452
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ConversationID (Intercept) 0.4517 0.6721
## SpeakerID (Intercept) 1.7322 1.3161
## FirstInPhrase 0.8588 0.9267 -0.28
## Number of obs: 1987, groups: ConversationID, 128; SpeakerID, 64
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.8774 0.2867 3.061 0.00221 **
## Age.z -0.3500 0.1924 -1.819 0.06886 .
## Sexm -2.2966 0.3935 -5.836 5.35e-09 ***
## FirstInPhrase 0.8340 0.1901 4.387 1.15e-05 ***
## LastInPhrase 1.0688 0.1505 7.101 1.24e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Age.z Sexm FrstIP
## Age.z -0.016
## Sexm -0.673 0.027
## FirstInPhrs -0.258 0.021 -0.072
## LastInPhras -0.134 -0.004 -0.024 0.066
# model performance
somers2(fitted(m), hcrc$UM)
## C Dxy n Missing
## 0.8881135 0.7762270 1987.0000000 0.0000000
range(hcrc$Age)
## [1] 17 30
Men are relatively less likely to use UM than women. When the hesitation marker is located at the beginning or the end of a phrase, it is more likely that the hesitation marker is equal to UM. While the direction of the age effect is as predicted, it is not significant. The likely reason for this is the relatively small age range in this data set.
hcrc2 = unique(hcrc[,c("SpeakerID","Age","Age.z","Sex","nWords","nUM","nUH")])
dim(hcrc2)
## [1] 64 7
# average proportion UM as opposed to UH per speaker
mean(hcrc2$nUM / (hcrc2$nUM + hcrc2$nUH), na.rm=T)
## [1] 0.5716515
# percentage UM of all words
mean(hcrc2$nUM / hcrc2$nWords)
## [1] 0.008127415
# percentage UH of all words
mean(hcrc2$nUH / hcrc2$nWords)
## [1] 0.005806336
# Older people and men are relatively less likely to use UM
# (note, however, the limited age range in this dataset)
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1|SpeakerID), data=hcrc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1 | SpeakerID)
## Data: hcrc2
##
## AIC BIC logLik deviance df.resid
## 483.4 492.0 -237.7 475.4 60
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.64297 -0.32997 0.03752 0.18109 0.57951
##
## Random effects:
## Groups Name Variance Std.Dev.
## SpeakerID (Intercept) 0.6941 0.8331
## Number of obs: 64, groups: SpeakerID, 64
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.741359 0.002803 -1691.7 <2e-16 ***
## Sexm -1.027418 0.002803 -366.6 <2e-16 ***
## Age.z -0.358405 0.002802 -127.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sexm
## Sexm 0.001
## Age.z 0.001 0.001
## convergence code: 0
## Model failed to converge with max|grad| = 0.0252223 (tol = 0.001, component 1)
# Men are more likely to use UH than women, but older men are less likely to use UH,
# whereas older women are relatively more likely to use UH
# (note, however, the limited age range in this dataset)
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1|SpeakerID), data=hcrc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1 | SpeakerID)
## Data: hcrc2
##
## AIC BIC logLik deviance df.resid
## 446.2 457.0 -218.1 436.2 59
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.21354 -0.46214 0.03552 0.23155 0.57126
##
## Random effects:
## Groups Name Variance Std.Dev.
## SpeakerID (Intercept) 0.7567 0.8699
## Number of obs: 64, groups: SpeakerID, 64
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.124447 0.002195 -2790.0 <2e-16 ***
## Sexm 0.984824 0.002193 449.1 <2e-16 ***
## Sexf:Age.z 0.144781 0.002193 66.0 <2e-16 ***
## Sexm:Age.z -0.125410 0.002193 -57.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sexm Sxf:A.
## Sexm 0.000
## Sexf:Age.z 0.000 0.000
## Sexm:Age.z 0.000 0.000 0.000
## convergence code: 0
## Model failed to converge with max|grad| = 0.0322403 (tol = 0.001, component 1)
# Older people are less likely to use either UH or UM
# (note, however, the limited age range in this dataset)
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1|SpeakerID), data=hcrc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1 | SpeakerID)
## Data: hcrc2
##
## AIC BIC logLik deviance df.resid
## 539.7 548.4 -265.9 531.7 60
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.01320 -0.28313 -0.02075 0.20122 0.46859
##
## Random effects:
## Groups Name Variance Std.Dev.
## SpeakerID (Intercept) 0.3825 0.6185
## Number of obs: 64, groups: SpeakerID, 64
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.4138 0.1160 -38.05 <2e-16 ***
## Sexm -0.1133 0.1643 -0.69 0.4905
## Age.z -0.1965 0.0841 -2.34 0.0195 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sexm
## Sexm -0.704
## Age.z -0.007 0.028
download.file('http://www.let.rug.nl/wieling/uhum/German/ger.rda','ger.rda')
load('ger.rda')
dim(ger)
## [1] 16221 9
str(ger)
## 'data.frame': 16221 obs. of 9 variables:
## $ Speaker : Factor w/ 238 levels "FOLK_S_00027",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BirthYear : int 1967 1967 1967 1967 1967 1967 1967 1967 1967 1967 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ Interview : Factor w/ 234 levels "FOLK_E_00002_SE_01_T_01_DF_01",..: 3 3 3 3 3 3 4 4 4 4 ...
## $ Variant : Factor w/ 4 levels "äh","ähm","öh",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ UM : num 0 0 1 0 0 0 0 0 0 0 ...
## $ nUM : num 9 9 9 9 9 9 9 9 9 9 ...
## $ nUH : num 71 71 71 71 71 71 71 71 71 71 ...
## $ BirthYear.z: num -0.868 -0.868 -0.868 -0.868 -0.868 ...
colnames(ger)
## [1] "Speaker" "BirthYear" "Gender" "Interview" "Variant"
## [6] "UM" "nUM" "nUH" "BirthYear.z"
visualizeUHUM(ger,"Speaker","BirthYear","Gender","German")
# takes about 40 seconds
m <- glmer(UM ~ BirthYear.z + Gender + (1|Speaker) + (0+BirthYear.z|Interview) +
(1+Gender|Interview), family='binomial', data=ger)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: UM ~ BirthYear.z + Gender + (1 | Speaker) + (0 + BirthYear.z |
## Interview) + (1 + Gender | Interview)
## Data: ger
##
## AIC BIC logLik deviance df.resid
## 17299.0 17360.6 -8641.5 17283.0 16213
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6928 -0.6493 -0.2731 0.6959 6.1829
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Speaker (Intercept) 0.92586 0.9622
## Interview BirthYear.z 0.02069 0.1438
## Interview.1 (Intercept) 0.07078 0.2660
## GenderM 0.41934 0.6476 -0.80
## Number of obs: 16221, groups: Speaker, 238; Interview, 234
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.12511 0.10850 1.153 0.24889
## BirthYear.z 0.94290 0.08197 11.503 < 2e-16 ***
## GenderM -0.43433 0.16102 -2.697 0.00699 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) BrthY.
## BirthYear.z -0.047
## GenderM -0.688 0.083
# model performance
somers2(fitted(m), ger$UM)
## C Dxy n Missing
## 8.163982e-01 6.327963e-01 1.622100e+04 0.000000e+00
Older people (i.e. with a lower year of birth) and men are relatively less likely to use UM than younger people and women.
ger2 = unique(ger[,c("Speaker","BirthYear","Gender","BirthYear.z","nUM","nUH")])
dim(ger2)
## [1] 238 6
# average proportion UM as opposed to UH per speaker
mean(ger2$nUM / (ger2$nUM + ger2$nUH), na.rm=T)
## [1] 0.501731
# Note that the number of words per speaker was not available in this dataset,
# so additional models (assessing the relative frequency of hesitation markers
# with respect to all other words) are not possible.
download.file('http://www.let.rug.nl/wieling/uhum/Norwegian/nor.rda','nor.rda')
load('nor.rda')
dim(nor)
## [1] 47604 13
table(nor$AgeGroup,nor$RecordingYear) # unequal distribution
##
## 1951 1956 1958 1959 1960 1962 1963 1964 1965 1967 1968 1969 1970
## Old 138 263 159 162 128 29 119 8 88 452 1312 987 220
## Young 0 0 0 0 0 0 0 0 0 0 62 0 0
##
## 1971 1972 1973 1974 1975 1976 1978 1979 1980 1984 2006 2007 2008
## Old 268 185 799 509 141 191 198 1234 22 21 867 3147 9100
## Young 0 0 0 0 0 3 0 0 0 0 84 1996 6455
##
## 2009 2010 2011 2012
## Old 6099 5372 332 81
## Young 3634 2154 493 75
str(nor)
## 'data.frame': 47604 obs. of 13 variables:
## $ Speaker : Factor w/ 554 levels "aal_01um","aal_02uk",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AgeGroup : Factor w/ 2 levels "Old","Young": 2 2 2 2 2 2 2 2 2 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Region : Factor w/ 5 levels "Nord-Norge","Østlandet",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Area : Factor w/ 18 levels "Akershus","Aust-Agder",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Place : Factor w/ 163 levels "Ål","Alvdal",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ RecordingYear : int 2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
## $ Form : Factor w/ 9 levels "e","E","em","EM",..: 1 1 3 1 3 1 1 1 6 1 ...
## $ UM : int 0 0 1 0 1 0 0 0 1 0 ...
## $ nWords : int 3824 3824 3824 3824 3824 3824 3824 3824 3824 3824 ...
## $ nUM : int 36 36 36 36 36 36 36 36 36 36 ...
## $ nUH : num 101 101 101 101 101 101 101 101 101 101 ...
## $ RecordingYear.z: num 0.484 0.484 0.484 0.484 0.484 ...
colnames(nor)
## [1] "Speaker" "AgeGroup" "Gender"
## [4] "Region" "Area" "Place"
## [7] "RecordingYear" "Form" "UM"
## [10] "nWords" "nUM" "nUH"
## [13] "RecordingYear.z"
n1 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian")
n2 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian","umwords")
n3 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian","uhwords")
n4 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian","uhmwords")
multiplot(n1,n2,n4,n3,cols=2)
n5 = visualizeUHUM(nor,"Speaker","RecordingYear","Gender","Norwegian")
n5
# takes about 30 seconds
m <- glmer(UM ~ AgeGroup + Gender + RecordingYear.z + (1|Speaker), family='binomial', data=nor)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: UM ~ AgeGroup + Gender + RecordingYear.z + (1 | Speaker)
## Data: nor
##
## AIC BIC logLik deviance df.resid
## 32281.3 32325.1 -16135.6 32271.3 47582
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.0661 -0.3865 -0.2880 -0.1800 8.2064
##
## Random effects:
## Groups Name Variance Std.Dev.
## Speaker (Intercept) 0.8291 0.9106
## Number of obs: 47587, groups: Speaker, 553
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.42837 0.07829 -31.019 < 2e-16 ***
## AgeGroupYoung 0.64721 0.09546 6.780 1.20e-11 ***
## GenderMale -0.23132 0.09071 -2.550 0.0108 *
## RecordingYear.z 0.34890 0.05869 5.945 2.77e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) AgGrpY GndrMl
## AgeGroupYng -0.485
## GenderMale -0.646 0.013
## RecrdngYr.z -0.082 -0.318 0.156
# model performance
somers2(fitted(m), nor[!is.na(nor$RecordingYear),]$UM)
## C Dxy n Missing
## 7.582430e-01 5.164861e-01 4.758700e+04 0.000000e+00
Older people and men are relatively less likely to use UM than younger people and women. People recorded in more recent years are more likely to use UM than people recorded earlier.
nor2 = unique(nor[,c("Speaker","AgeGroup","Gender","RecordingYear","RecordingYear.z","nWords","nUM","nUH")])
dim(nor2)
## [1] 554 8
# average proportion UM as opposed to UH per speaker
mean(nor2$nUM / (nor2$nUM + nor2$nUH), na.rm=T)
## [1] 0.1285163
# percentage UM of all words
mean(nor2$nUM / nor2$nWords)
## [1] 0.002645844
# percentage UH of all words
mean(nor2$nUH / nor2$nWords)
## [1] 0.01885864
# Younger people are relatively more likely than older people to use UM, older people (but not younger people)
# who were more recently recorded are relatively more likely to use UM
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Gender + AgeGroup + AgeGroup:RecordingYear.z + (1|Speaker), data=nor2,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUM, nWords - nUM) ~ Gender + AgeGroup + AgeGroup:RecordingYear.z +
## (1 | Speaker)
## Data: nor2
##
## AIC BIC logLik deviance df.resid
## 3541.3 3567.2 -1764.7 3529.3 547
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.43065 -0.35547 0.00705 0.18730 1.67140
##
## Random effects:
## Groups Name Variance Std.Dev.
## Speaker (Intercept) 0.8852 0.9408
## Number of obs: 553, groups: Speaker, 553
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.55851 0.07986 -82.13 < 2e-16 ***
## GenderMale 0.03070 0.09214 0.33 0.738992
## AgeGroupYoung 0.73395 0.19055 3.85 0.000117 ***
## AgeGroupOld:RecordingYear.z 0.38582 0.06039 6.39 1.67e-10 ***
## AgeGroupYoung:RecordingYear.z -0.49511 0.33593 -1.47 0.140524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GndrMl AgGrpY AGO:RY
## GenderMale -0.641
## AgeGroupYng -0.240 0.002
## AgGrpOl:RY. -0.077 0.151 -0.014
## AgGrpYn:RY. -0.024 0.033 -0.876 0.007
# Men and older people are relatively more likely to use UH
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Gender + AgeGroup + Gender + RecordingYear.z + (1|Speaker), data=nor2,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH, nWords - nUH) ~ Gender + AgeGroup + Gender + RecordingYear.z +
## (1 | Speaker)
## Data: nor2
##
## AIC BIC logLik deviance df.resid
## 5389.0 5410.5 -2689.5 5379.0 548
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.60536 -0.16780 0.00832 0.12258 0.87358
##
## Random effects:
## Groups Name Variance Std.Dev.
## Speaker (Intercept) 0.3365 0.5801
## Number of obs: 553, groups: Speaker, 553
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.14330 0.04423 -93.67 < 2e-16 ***
## GenderMale 0.27360 0.05232 5.23 1.70e-07 ***
## AgeGroupYoung -0.33226 0.05677 -5.85 4.82e-09 ***
## RecordingYear.z 0.02952 0.02921 1.01 0.312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GndrMl AgGrpY
## GenderMale -0.648
## AgeGroupYng -0.493 0.003
## RecrdngYr.z 0.052 0.157 -0.370
# Men and older people are also relatively more likely to either use UM or UH
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + AgeGroup + RecordingYear.z + (1|Speaker),
data=nor2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + AgeGroup + RecordingYear.z +
## (1 | Speaker)
## Data: nor2
##
## AIC BIC logLik deviance df.resid
## 5516.4 5538.0 -2753.2 5506.4 548
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.72681 -0.15603 0.01666 0.12143 0.91305
##
## Random effects:
## Groups Name Variance Std.Dev.
## Speaker (Intercept) 0.3209 0.5665
## Number of obs: 553, groups: Speaker, 553
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.01489 0.04308 -93.19 < 2e-16 ***
## GenderMale 0.24686 0.05092 4.85 1.25e-06 ***
## AgeGroupYoung -0.24269 0.05522 -4.40 1.11e-05 ***
## RecordingYear.z 0.05309 0.02850 1.86 0.0625 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) GndrMl AgGrpY
## GenderMale -0.647
## AgeGroupYng -0.495 0.005
## RecrdngYr.z 0.052 0.157 -0.370
download.file('http://www.let.rug.nl/wieling/uhum/Danish-Faroese/danfar.rda','danfar.rda')
load('danfar.rda')
dim(danfar)
## [1] 4504 17
str(danfar)
## 'data.frame': 4504 obs. of 17 variables:
## $ speaker : Factor w/ 57 levels "Interviewer2",..: 49 49 49 49 49 49 49 49 49 49 ...
## $ age : num 26 26 26 26 26 26 26 26 26 26 ...
## $ sex : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 1 1 ...
## $ lang : Factor w/ 2 levels "dan","far": 1 1 1 1 1 1 1 1 1 1 ...
## $ word : Factor w/ 50 levels "Eeeeh","eeeehh",..: 42 42 42 42 42 45 42 42 42 42 ...
## $ type : Factor w/ 2 levels "uh","uhm": 1 1 1 1 1 2 1 1 1 1 ...
## $ UM : num 0 0 0 0 0 1 0 0 0 0 ...
## $ nWords : num 25462 25462 25462 25462 25462 ...
## $ nWordsDan: num 25462 25462 25462 25462 25462 ...
## $ nWordsFar: num 0 0 0 0 0 0 0 0 0 0 ...
## $ nUM : num 73 73 73 73 73 73 73 73 73 73 ...
## $ nUMDan : num 73 73 73 73 73 73 73 73 73 73 ...
## $ nUMFar : num 0 0 0 0 0 0 0 0 0 0 ...
## $ nUH : num 284 284 284 284 284 284 284 284 284 284 ...
## $ nUHDan : num 284 284 284 284 284 284 284 284 284 284 ...
## $ nUHFar : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age.z : num -0.794 -0.794 -0.794 -0.794 -0.794 ...
colnames(danfar)
## [1] "speaker" "age" "sex" "lang" "word"
## [6] "type" "UM" "nWords" "nWordsDan" "nWordsFar"
## [11] "nUM" "nUMDan" "nUMFar" "nUH" "nUHDan"
## [16] "nUHFar" "age.z"
# Danish/Faroese together
danfar$birthyear = 2007 - danfar$age # approximation as recording year ranges between 2005 and 2009
df1 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese")
df2 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese","umwords")
df3 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese","uhwords")
df4 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese","uhmwords")
multiplot(df1,df2,df4,df3,cols=2)
# Danish only
dan = droplevels(danfar[danfar$lang == 'dan',])
d1 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish")
d2 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish","umwords",nwordVar='nWordsDan')
d3 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish","uhwords",nwordVar='nWordsDan')
d4 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish","uhmwords",nwordVar='nWordsDan')
multiplot(d1,d2,d4,d3,cols=2)
# Faroese only
far = droplevels(danfar[danfar$lang == 'far',])
f1 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese")
f2 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese","umwords",nwordVar='nWordsFar')
f3 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese","uhwords",nwordVar='nWordsFar')
f4 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese","uhmwords",nwordVar='nWordsFar')
multiplot(f1,f2,f4,f3,cols=2)
# takes about 30 seconds
m <- glmer(UM ~ sex + age.z:lang + (1+lang|speaker), family='binomial', data=danfar)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: UM ~ sex + age.z:lang + (1 + lang | speaker)
## Data: danfar
##
## AIC BIC logLik deviance df.resid
## 3940.2 3985.0 -1963.1 3926.2 4497
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.1698 -0.4785 -0.3196 -0.1591 5.8215
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## speaker (Intercept) 0.2412 0.4912
## langfar 4.5529 2.1337 0.27
## Number of obs: 4504, groups: speaker, 57
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2888 0.1494 -8.624 < 2e-16 ***
## sexm -0.5876 0.2008 -2.927 0.00342 **
## age.z:langdan -0.4118 0.1010 -4.077 4.56e-05 ***
## age.z:langfar -0.5937 0.4276 -1.388 0.16504
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) sexm ag.z:lngd
## sexm -0.335
## age.z:lngdn 0.202 0.125
## age.z:lngfr -0.043 -0.086 0.169
# model performance
somers2(fitted(m), danfar$UM)
## C Dxy n Missing
## 0.7750354 0.5500709 4504.0000000 0.0000000
Older people and men are relatively less likely to use UM than younger people and women. The age effect is greater in the Faroese language than in the Danish language.
danfar2 = unique(danfar[,c("speaker","age","age.z","sex","nWords","nUM","nUH")])
dim(danfar2) # note the small number of speakers
## [1] 57 7
# average proportion UM as opposed to UH per speaker
mean(danfar2$nUM / (danfar2$nUM + danfar2$nUH), na.rm=T)
## [1] 0.1652995
# percentage UM of all words
mean(danfar2$nUM / danfar2$nWords)
## [1] 0.001991513
# percentage UH of all words
mean(danfar2$nUH / danfar2$nWords)
## [1] 0.007876947
# Older people are less likely to use UM
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ age.z + sex + (1|speaker), data=danfar2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ age.z + sex + (1 | speaker)
## Data: danfar2
##
## AIC BIC logLik deviance df.resid
## 403.1 411.3 -197.6 395.1 53
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.15879 -0.30769 0.04376 0.15900 0.52398
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 1.337 1.156
## Number of obs: 57, groups: speaker, 57
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.7956 0.2297 -29.587 <2e-16 ***
## age.z -0.4186 0.1714 -2.442 0.0146 *
## sexm -0.3271 0.3376 -0.969 0.3325
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z 0.082
## sexm -0.663 -0.085
# Use of UH is not influenced by age or gender
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ age.z + sex + (1|speaker), data=danfar2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ age.z + sex + (1 | speaker)
## Data: danfar2
##
## AIC BIC logLik deviance df.resid
## 559.8 568.0 -275.9 551.8 53
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.92994 -0.13737 0.00454 0.09219 0.19001
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.6666 0.8164
## Number of obs: 57, groups: speaker, 57
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.16269 0.15367 -33.60 <2e-16 ***
## age.z -0.09431 0.11436 -0.82 0.410
## sexm 0.04130 0.22512 0.18 0.854
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z 0.061
## sexm -0.683 -0.084
# Use of either UM or UH is not influenced by age or gender
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ age.z + sex + (1|speaker), data=danfar2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH + nUM, nWords - nUH - nUM) ~ age.z + sex + (1 | speaker)
## Data: danfar2
##
## AIC BIC logLik deviance df.resid
## 586.7 594.9 -289.4 578.7 53
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.94196 -0.09405 -0.00001 0.08330 0.15900
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.7473 0.8645
## Number of obs: 57, groups: speaker, 57
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.92061 0.16147 -30.474 <2e-16 ***
## age.z -0.15661 0.11990 -1.306 0.191
## sexm -0.06841 0.23672 -0.289 0.773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z 0.066
## sexm -0.684 -0.091
dan2 = unique(dan[,c("speaker","age","age.z","sex","nWordsDan","nUMDan","nUHDan")])
dim(dan2) # note the small number of speakers
## [1] 43 7
# average proportion UM as opposed to UH per speaker
mean(dan2$nUM / (dan2$nUM + dan2$nUH), na.rm=T)
## [1] 0.1952701
# percentage UM of all words
mean(dan2$nUM / dan2$nWords)
## [1] 0.003941396
# percentage UH of all words
mean(dan2$nUH / dan2$nWords)
## [1] 0.01605026
# Older people are less likely to use UM
summary(mUM <- glmer(cbind(nUMDan, nWordsDan - nUMDan) ~ age.z + sex + (1|speaker), data=dan2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUMDan, nWordsDan - nUMDan) ~ age.z + sex + (1 | speaker)
## Data: dan2
##
## AIC BIC logLik deviance df.resid
## 300.8 307.8 -146.4 292.8 39
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.0772 -0.3598 0.0117 0.2002 0.6453
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.4025 0.6344
## Number of obs: 43, groups: speaker, 43
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.7334 0.1475 -38.87 < 2e-16 ***
## age.z -0.4804 0.1128 -4.26 2.04e-05 ***
## sexm -0.2317 0.2210 -1.05 0.295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z 0.029
## sexm -0.659 0.009
# Men are more likely to use UH
summary(mUH <- glmer(cbind(nUHDan, nWordsDan - nUHDan) ~ age.z + sex + (1|speaker), data=dan2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUHDan, nWordsDan - nUHDan) ~ age.z + sex + (1 | speaker)
## Data: dan2
##
## AIC BIC logLik deviance df.resid
## 416.0 423.1 -204.0 408.0 39
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.72174 -0.20364 -0.01880 0.06879 0.39320
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.3193 0.5651
## Number of obs: 43, groups: speaker, 43
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.45102 0.12112 -36.75 <2e-16 ***
## age.z -0.08576 0.09132 -0.94 0.3477
## sexm 0.35532 0.18037 1.97 0.0488 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z -0.045
## sexm -0.670 0.012
# Older people tend to be less likely to use either UM or UH (marginally significant)
summary(mUHUM <- glmer(cbind(nUHDan + nUMDan, nWordsDan - nUHDan - nUMDan) ~ age.z + sex + (1|speaker), data=dan2,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUHDan + nUMDan, nWordsDan - nUHDan - nUMDan) ~ age.z +
## sex + (1 | speaker)
## Data: dan2
##
## AIC BIC logLik deviance df.resid
## 434.3 441.3 -213.1 426.3 39
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.71377 -0.16113 -0.02137 0.07914 0.35917
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.3161 0.5622
## Number of obs: 43, groups: speaker, 43
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.15765 0.11929 -34.85 <2e-16 ***
## age.z -0.17558 0.09021 -1.95 0.0516 .
## sexm 0.22770 0.17829 1.28 0.2015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z -0.041
## sexm -0.668 0.010
far2 = unique(far[,c("speaker","age","age.z","sex","nWordsFar","nUMFar","nUHFar")])
dim(far2) # note the small number of speakers)
## [1] 48 7
# average proportion UM as opposed to UH per speaker
mean(far2$nUM / (far2$nUM + far2$nUH), na.rm=T)
## [1] 0.09746815
# percentage UM of all words
mean(far2$nUM / far2$nWords)
## [1] 0.000429629
# percentage UH of all words
mean(far2$nUH / far2$nWords)
## [1] 0.002967625
# Older people are less likely to use UM
summary(mUM <- glmer(cbind(nUMFar, nWordsFar - nUMFar) ~ age.z + sex + (1|speaker), data=far2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUMFar, nWordsFar - nUMFar) ~ age.z + sex + (1 | speaker)
## Data: far2
##
## AIC BIC logLik deviance df.resid
## 141.5 149.0 -66.8 133.5 44
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.5537 -0.4422 -0.1977 0.2227 1.5478
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 3.334 1.826
## Number of obs: 48, groups: speaker, 48
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.4549 0.6339 -14.915 < 2e-16 ***
## age.z -1.1824 0.4452 -2.656 0.00791 **
## sexm -0.4776 0.7286 -0.656 0.51210
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z 0.322
## sexm -0.567 -0.080
# Use of UH is not influenced by age or gender
summary(mUH <- glmer(cbind(nUHFar, nWordsFar - nUHFar) ~ age.z + sex + (1|speaker), data=far2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUHFar, nWordsFar - nUHFar) ~ age.z + sex + (1 | speaker)
## Data: far2
##
## AIC BIC logLik deviance df.resid
## 332.8 340.3 -162.4 324.8 44
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.12895 -0.26198 -0.03189 0.17136 0.64055
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.6112 0.7818
## Number of obs: 48, groups: speaker, 48
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.15937 0.18492 -33.31 <2e-16 ***
## age.z -0.20662 0.13916 -1.48 0.138
## sexm 0.03106 0.25274 0.12 0.902
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z 0.085
## sexm -0.728 -0.098
# Older people are relatively less likely to use either UM or UH
summary(mUHUM <- glmer(cbind(nUHFar + nUMFar, nWordsFar - nUHFar - nUMFar) ~ age.z + sex + (1|speaker), data=far2,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUHFar + nUMFar, nWordsFar - nUHFar - nUMFar) ~ age.z +
## sex + (1 | speaker)
## Data: far2
##
## AIC BIC logLik deviance df.resid
## 344.3 351.8 -168.2 336.3 44
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.17897 -0.19065 -0.01544 0.19102 0.66997
##
## Random effects:
## Groups Name Variance Std.Dev.
## speaker (Intercept) 0.5977 0.7731
## Number of obs: 48, groups: speaker, 48
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.97027 0.18000 -33.17 <2e-16 ***
## age.z -0.28502 0.13573 -2.10 0.0357 *
## sexm -0.07555 0.24781 -0.30 0.7605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age.z
## age.z 0.105
## sexm -0.726 -0.112
download.file('http://www.let.rug.nl/wieling/uhum/Dutch/nl.rda','nl.rda')
load('nl.rda')
dim(nl)
## [1] 228619 44
str(nl)
## 'data.frame': 228619 obs. of 44 variables:
## $ wordid : Factor w/ 228619 levels "fn000001.12.7",..: 552 579 558 550 540 587 565 570 532 551 ...
## $ interviewid : Factor w/ 5181 levels "fn000001","fn000002",..: 15 19 15 15 14 19 15 16 12 15 ...
## $ speakerid : Factor w/ 3433 levels "N00003","N00005",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ word : Factor w/ 2 levels "uh","uhm": 1 2 1 1 1 1 1 1 1 1 ...
## $ prevword : Factor w/ 15014 levels "'k","'m","'n",..: 12639 4766 8683 12246 5782 12480 9029 7 14823 2695 ...
## $ prevPOS : Factor w/ 11 levels "ADJ","BW","LID",..: 6 11 10 7 10 11 10 3 11 2 ...
## $ nextword : Factor w/ 22438 levels "'k","'m","'ns",..: 4006 5296 10294 5296 8552 4609 7 12769 11134 19103 ...
## $ nextPOS : Factor w/ 11 levels "ADJ","BW","LID",..: 1 8 4 8 10 4 3 4 8 6 ...
## $ component : Factor w/ 15 levels "comp-a","comp-b",..: 12 12 12 12 12 12 12 9 9 12 ...
## $ phrasenumber : int 27 2 30 22 9 4 6 4 8 27 ...
## $ interviewlength: int 39 30 39 39 9 30 39 5 11 39 ...
## $ relsentpos : num 0.6923 0.0667 0.7692 0.5641 1 ...
## $ position : int 12 1 3 13 11 3 12 4 1 11 ...
## $ phraselength : int 19 9 13 31 21 22 23 12 9 19 ...
## $ relpos : num 0.632 0.111 0.231 0.419 0.524 ...
## $ isfirst : num 0 1 0 0 0 0 0 0 1 0 ...
## $ islast : num 0 0 0 0 0 0 0 0 0 0 ...
## $ duration : num 0.088 0.475 0.091 0.051 0.071 ...
## $ pausebefore : num 0.254 0.076 0 0.106 0 ...
## $ pauseafter : num 0.182 0.222 0.334 0.212 0 ...
## $ recyear : int 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
## $ preparedness : int 3 3 3 3 3 3 3 1 1 3 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
## $ birthyear : int 1977 1977 1977 1977 1977 1977 1977 1977 1977 1977 ...
## $ age : int 22 22 22 22 22 22 22 22 22 22 ...
## $ edulevel : num 9 9 9 9 9 9 9 9 9 9 ...
## $ wps : num 3.49 3.49 3.49 3.49 3.49 ...
## $ country : Factor w/ 2 levels "N","V": 1 1 1 1 1 1 1 1 1 1 ...
## $ UM : num 0 1 0 0 0 0 0 0 0 0 ...
## $ nWords : int 2235 2235 2235 2235 2235 2235 2235 2235 2235 2235 ...
## $ nUM : num 3 3 3 3 3 3 3 3 3 3 ...
## $ nUH : num 65 65 65 65 65 65 65 65 65 65 ...
## $ relsentpos.z : num 0.632 -1.487 0.892 0.198 1.674 ...
## $ relpos.z : num 0.36973 -1.37034 -0.97029 -0.3398 0.00942 ...
## $ duration.z : num -0.72 1.562 -0.702 -0.938 -0.82 ...
## $ pausebefore.z : num 0.148 -0.244 -0.411 -0.178 -0.411 ...
## $ pauseafter.z : num -0.171 -0.09 0.135 -0.11 -0.537 ...
## $ recyear.z : num -0.99 -0.99 -0.99 -0.99 -0.99 ...
## $ preparedness.z : num 1.66 1.66 1.66 1.66 1.66 ...
## $ birthyear.z : num 1.16 1.16 1.16 1.16 1.16 ...
## $ age.z : num -1.33 -1.33 -1.33 -1.33 -1.33 ...
## $ edulevel.z : num 0.0576 0.0576 0.0576 0.0576 0.0576 ...
## $ nWords.z : num 0.0382 0.0382 0.0382 0.0382 0.0382 ...
## $ wps.z : num -0.0542 -0.0542 -0.0542 -0.0542 -0.0542 ...
colnames(nl)
## [1] "wordid" "interviewid" "speakerid"
## [4] "word" "prevword" "prevPOS"
## [7] "nextword" "nextPOS" "component"
## [10] "phrasenumber" "interviewlength" "relsentpos"
## [13] "position" "phraselength" "relpos"
## [16] "isfirst" "islast" "duration"
## [19] "pausebefore" "pauseafter" "recyear"
## [22] "preparedness" "sex" "birthyear"
## [25] "age" "edulevel" "wps"
## [28] "country" "UM" "nWords"
## [31] "nUM" "nUH" "relsentpos.z"
## [34] "relpos.z" "duration.z" "pausebefore.z"
## [37] "pauseafter.z" "recyear.z" "preparedness.z"
## [40] "birthyear.z" "age.z" "edulevel.z"
## [43] "nWords.z" "wps.z"
# relative proportion of UM in the Netherlands
mean(na.exclude(nl[nl$country=='N',]$UM))
## [1] 0.08507495
# relative proportion of UM in Flanders
mean(na.exclude(nl[nl$country=='V',]$UM))
## [1] 0.1577117
# average duration of UM in the Netherlands
mean(na.exclude(nl[nl$country=='N' & nl$word=='uhm',]$duration))
## [1] 0.3073129
# average duration of UM in Flanders
mean(na.exclude(nl[nl$country=='V' & nl$word=='uhm',]$duration))
## [1] 0.5231677
n1 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch")
n2 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch","umwords")
n3 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch","uhwords")
n4 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch","uhmwords")
multiplot(n1,n2,n4,n3,cols=2)
n5 = visualizeUHUM(nl,"speakerid","recyear","sex","Dutch")
n5
# duration: about 30 minutes
m <- glmer(UM ~ country + country:sex + country:age.z + preparedness.z + edulevel.z + isfirst + recyear.z +
islast + relpos.z + relsentpos.z + duration.z + pausebefore.z + pauseafter.z +
(1|speakerid) + (1|interviewid), data=nl, family='binomial',
control=glmerControl(optimizer="bobyqa"))
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: UM ~ country + country:sex + country:age.z + preparedness.z +
## edulevel.z + isfirst + recyear.z + islast + relpos.z + relsentpos.z +
## duration.z + pausebefore.z + pauseafter.z + (1 | speakerid) +
## (1 | interviewid)
## Data: nl
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 83530.9 83713.3 -41747.5 83494.9 185109
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2191.67 -0.26 -0.15 -0.08 33.82
##
## Random effects:
## Groups Name Variance Std.Dev.
## interviewid (Intercept) 0.2112 0.4596
## speakerid (Intercept) 1.2745 1.1289
## Number of obs: 185127, groups: interviewid, 4670; speakerid, 2574
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.676224 0.065678 -40.75 < 2e-16 ***
## countryV -0.605820 0.092985 -6.52 7.26e-11 ***
## preparedness.z 0.277940 0.037512 7.41 1.27e-13 ***
## edulevel.z 0.147628 0.030573 4.83 1.37e-06 ***
## isfirst 0.512852 0.029647 17.30 < 2e-16 ***
## recyear.z 0.094139 0.025874 3.64 0.000274 ***
## islast 0.957705 0.039419 24.30 < 2e-16 ***
## relpos.z -0.347087 0.013536 -25.64 < 2e-16 ***
## relsentpos.z -0.069232 0.009602 -7.21 5.58e-13 ***
## duration.z 1.154341 0.010951 105.41 < 2e-16 ***
## pausebefore.z 0.170756 0.008172 20.90 < 2e-16 ***
## pauseafter.z 0.470273 0.008348 56.34 < 2e-16 ***
## countryN:sexmale -0.495872 0.077098 -6.43 1.26e-10 ***
## countryV:sexmale -0.890460 0.097126 -9.17 < 2e-16 ***
## countryN:age.z -0.280348 0.035951 -7.80 6.28e-15 ***
## countryV:age.z -0.623677 0.052028 -11.99 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cntryV prprd. edlvl. isfrst rcyr.z islast rlps.z rlsnt.
## countryV -0.478
## preprdnss.z 0.493 0.060
## edulevel.z 0.051 -0.198 -0.170
## isfirst -0.037 -0.016 -0.014 0.003
## recyear.z 0.010 -0.024 0.248 0.083 -0.004
## islast -0.074 0.005 0.022 0.005 -0.300 0.001
## relpos.z 0.065 -0.004 0.002 0.000 0.413 -0.010 -0.696
## relsentps.z -0.007 -0.004 -0.025 0.001 0.010 -0.005 -0.012 -0.001
## duration.z -0.033 -0.133 0.026 -0.004 -0.075 0.011 -0.002 0.002 -0.004
## pausebefr.z -0.004 0.014 -0.007 -0.001 -0.416 0.001 0.040 -0.040 -0.006
## pauseaftr.z -0.022 0.011 0.035 0.016 0.029 0.005 -0.102 -0.028 -0.018
## cntryN:sxml -0.661 0.434 -0.078 -0.105 -0.011 0.131 0.004 -0.003 0.002
## cntryV:sxml -0.074 -0.565 -0.127 0.000 -0.004 0.048 -0.001 0.003 0.003
## contryN:g.z 0.074 -0.075 -0.084 -0.034 0.010 -0.010 0.003 0.008 -0.004
## contryV:g.z -0.057 0.117 -0.120 0.028 0.008 -0.008 0.001 0.006 -0.005
## drtn.z psbfr. psftr. cntrN: cntrV: cntN:.
## countryV
## preprdnss.z
## edulevel.z
## isfirst
## recyear.z
## islast
## relpos.z
## relsentps.z
## duration.z
## pausebefr.z -0.027
## pauseaftr.z 0.019 0.029
## cntryN:sxml -0.017 -0.002 -0.012
## cntryV:sxml -0.015 -0.001 -0.015 0.029
## contryN:g.z -0.041 0.000 0.009 -0.135 0.014
## contryV:g.z -0.041 -0.003 -0.006 0.014 -0.097 0.013
# model performance, m@frame contains all data with non-missing values
somers2(fitted(m), m@frame$UM)
## C Dxy n Missing
## 9.094131e-01 8.188262e-01 1.851270e+05 0.000000e+00
Men, both in the Netherlands and in Flanders, are less likely than women to use UM (this gender effect is greatest in Flanders). Younger people, both in the Netherlands and in Flanders, are more likely to use UM than older people (this age effect is strongest in Flanders). People recorded later are more likely to prefer UM. While the model summary suggests (due to the significance of the country predictor) that women of average age in Flanders are less likely to use UM than those in the Netherlands, this is caused by the inclusion of the predictor modeling the duration of the hesitation marker (which is higher in Flanders). When duration is excluded from the model, women of average age in Flanders have a greater likelihood of using UM than those in the Netherlands. The other two subject-related predictors indicate that people with a higher education level have a greater likelihood of using UM than those with a lower education level. People who prepared what they were going to say (e.g., holding a speech versus spontaneous speech) were more likely to use UM than those with less preparation. The other predictors deal with possible different functions between UH and UM. When the position of the hesitation marker is at the beginning or at the end of the utterance, it is more likely to be UM rather than UH. Additionally, if the hesitation marker occurs later in the utterance it is less likely to be UM as opposed to UH. Whenever the utterance occurs later in the interview / audio recording, it is also less likely to contain UM. Finally, whenever the hesitation marker has longer duration, is preceded or followed by a longer pause (i.e. silence), it is more likely to be UM rather than UH.
Given the complexity of this model (the random-intercept-only model took about half an hour to fit, adding several random slopes would increase the required time to fit considerably), we fitted a series of models which only differed from the simple model above by the inclusion of a single random slope (per speaker or interview). In this way, we assessed if the inclusion of each predictor in the model as a (by-speaker or by-interview) random slope caused another predictor to become non-significant. The model reported above only contains those predictors which remained significant in all models. Note that there were a few additional interactions with country and non-speaker related predictors. These were not included in the model above as the direction of the effects was always the same in both countries and of secondary importance.
nl2 = unique(nl[,c("speakerid","sex","birthyear","age","edulevel","wps","country","nWords","nUM","nUH","birthyear.z","age.z","edulevel.z","wps.z")])
dim(nl2)
## [1] 3727 14
# average proportion UM as opposed to UH per speaker
mean(nl2$nUM / (nl2$nUM + nl2$nUH), na.rm=T)
## [1] 0.1085622
# percentage UM of all words
mean(nl2$nUM / nl2$nWords)
## [1] 0.003704241
# percentage UH of all words
mean(nl2$nUH / nl2$nWords)
## [1] 0.03149794
# People from Flanders, men, older people and people who speak faster are less likely to use UM.
# People with higher education are more likely to use UM.
# (adding an interaction between country and sex or birthyear caused non-convergence,
# other interactions were not tested here)
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ country + sex + age.z +
edulevel.z + wps.z + (1|speakerid),
data=nl2, family='binomial', control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ country + sex + age.z + edulevel.z +
## wps.z + (1 | speakerid)
## Data: nl2
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 14800.0 14841.7 -7393.0 14786.0 2855
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7084 -0.4495 -0.0133 0.1839 4.9581
##
## Random effects:
## Groups Name Variance Std.Dev.
## speakerid (Intercept) 1.29 1.136
## Number of obs: 2862, groups: speakerid, 2604
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.06992 0.04718 -128.67 < 2e-16 ***
## countryV -0.27419 0.05827 -4.71 2.53e-06 ***
## sexmale -0.31226 0.05478 -5.70 1.20e-08 ***
## age.z -0.29043 0.02752 -10.55 < 2e-16 ***
## edulevel.z 0.15902 0.02789 5.70 1.18e-08 ***
## wps.z -0.17578 0.03779 -4.65 3.31e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cntryV sexmal age.z edlvl.
## countryV -0.507
## sexmale -0.614 0.001
## age.z 0.053 0.110 -0.129
## edulevel.z 0.134 -0.165 -0.140 -0.023
## wps.z -0.238 0.281 -0.005 0.258 0.138
# Women in Flanders having an average age are less likely to use UH than women in the Netherlands of average age
# Men, both in Flanders and the Netherlands (more so in the former) are more likely to use UH than women.
# Older people in Flanders (but not in the Netherlands) are more likely to use UH.
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ country + country:sex + country:age.z +
(1|speakerid), data=nl2, family='binomial',
control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH, nWords - nUH) ~ country + country:sex + country:age.z +
## (1 | speakerid)
## Data: nl2
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 28049.5 28091.9 -14017.8 28035.5 3154
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.70167 -0.10814 0.01579 0.10781 2.20660
##
## Random effects:
## Groups Name Variance Std.Dev.
## speakerid (Intercept) 0.6359 0.7974
## Number of obs: 3161, groups: speakerid, 2867
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.67763 0.03046 -120.75 < 2e-16 ***
## countryV -0.75052 0.05119 -14.66 < 2e-16 ***
## countryN:sexmale 0.21013 0.04040 5.20 1.98e-07 ***
## countryV:sexmale 0.40182 0.05370 7.48 7.29e-14 ***
## countryN:age.z -0.02619 0.01862 -1.41 0.16
## countryV:age.z 0.14335 0.02590 5.53 3.14e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cntryV cntrN: cntrV: cntN:.
## countryV -0.594
## cntryN:sxml -0.761 0.453
## cntryV:sxml 0.000 -0.620 0.000
## contryN:g.z 0.109 -0.064 -0.155 0.000
## contryV:g.z 0.001 0.083 0.000 -0.119 0.001
# People from Flanders of average age are less likely to use either UM or UH than people from the Netherlands
# of average age. Men are more likely to use eigher UM or UH than women. Older people in the Netherlands are
# less likely to use either UM or UH, while the pattern is reversed in Flanders.
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ country + sex +
country:age.z + (1|speakerid),
data=nl2, family='binomial', control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## cbind(nUH + nUM, nWords - nUH - nUM) ~ country + sex + country:age.z +
## (1 | speakerid)
## Data: nl2
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 28721.1 28757.4 -14354.5 28709.1 3155
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.75429 -0.09778 0.01616 0.10274 2.05496
##
## Random effects:
## Groups Name Variance Std.Dev.
## speakerid (Intercept) 0.6243 0.7901
## Number of obs: 3161, groups: speakerid, 2867
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.57416 0.02669 -133.91 < 2e-16 ***
## countryV -0.57363 0.03235 -17.73 < 2e-16 ***
## sexmale 0.21922 0.03182 6.89 5.6e-12 ***
## countryN:age.z -0.05379 0.01825 -2.95 0.00320 **
## countryV:age.z 0.08089 0.02519 3.21 0.00132 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cntryV sexmal cntN:.
## countryV -0.429
## sexmale -0.683 -0.015
## contryN:g.z 0.079 0.008 -0.125
## contryV:g.z 0.050 0.020 -0.073 0.010
download.file('http://www.let.rug.nl/wieling/uhum/TwitterUS/twitterUS.rda','twitterUS.rda')
load('twitterUS.rda')
dim(twitterUSdat)
## [1] 25858 5
str(twitterUSdat)
## 'data.frame': 25858 obs. of 5 variables:
## $ User : Factor w/ 25858 levels "______Liz","______xMarie",..: 666 667 668 669 670 671 672 665 673 676 ...
## $ Gender: Factor w/ 2 levels "F","M": 2 1 2 2 2 2 1 2 1 1 ...
## $ nWords: int 1220 4041 8733 6511 1833 2418 11901 2960 5933 25096 ...
## $ nUM : int 1 2 6 0 0 0 2 1 1 6 ...
## $ nUH : int 0 0 1 4 1 1 1 1 0 7 ...
colnames(twitterUSdat)
## [1] "User" "Gender" "nWords" "nUM" "nUH"
us1 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","umuh")
us2 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","umwords")
us3 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","uhwords")
us4 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","uhmwords")
multiplot(us1,us2,us4,us3,cols=2)
# takes about 30 seconds
m <- glmer(cbind(nUM,nUH) ~ Gender + (1|User), family='binomial', data=twitterUSdat)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nUH) ~ Gender + (1 | User)
## Data: twitterUSdat
##
## AIC BIC logLik deviance df.resid
## 56168.1 56192.5 -28081.0 56162.1 25855
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.9977 -0.5720 0.0330 0.4969 0.9076
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 6.256 2.501
## Number of obs: 25858, groups: User, 25858
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.42504 0.03085 13.778 < 2e-16 ***
## GenderM -0.26477 0.04421 -5.989 2.11e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## GenderM -0.692
# model performance
somers2( fitted(m), (twitterUSdat$nUM > twitterUSdat$nUH)*1 )
## C Dxy n Missing
## 9.999897e-01 9.999794e-01 2.585800e+04 0.000000e+00
somers2( fitted(m), twitterUSdat$nUM / (twitterUSdat$nUM + twitterUSdat$nUH) )
## C Dxy n Missing
## 9.871825e-01 9.743650e-01 2.585800e+04 0.000000e+00
Men are relatively less likely to use UM than women.
# average proportion UM as opposed to UH per speaker
mean(twitterUSdat$nUM / (twitterUSdat$nUM + twitterUSdat$nUH), na.rm=T)
## [1] 0.5334387
# percentage UM of all words
mean(twitterUSdat$nUM / twitterUSdat$nWords)
## [1] 0.0002527892
# percentage UH of all words
mean(twitterUSdat$nUH / twitterUSdat$nWords)
## [1] 0.0001878946
# Men are relatively less likely than women to use UM.
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Gender + (1|User),
data=twitterUSdat,family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Gender + (1 | User)
## Data: twitterUSdat
##
## AIC BIC logLik deviance df.resid
## 85852.8 85877.3 -42923.4 85846.8 25855
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.36025 -0.52782 0.00306 0.50772 1.83745
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 1.346 1.16
## Number of obs: 25858, groups: User, 25858
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.07547 0.01455 -623.6 <2e-16 ***
## GenderM -0.27112 0.02042 -13.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## GenderM -0.653
# Men are relatively less likely than women to use UH.
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Gender + (1|User), data=twitterUSdat,
family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Gender + (1 | User)
## Data: twitterUSdat
##
## AIC BIC logLik deviance df.resid
## 75758.6 75783.1 -37876.3 75752.6 25855
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6775 -0.5207 -0.1841 0.5154 2.3350
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 1.115 1.056
## Number of obs: 25858, groups: User, 25858
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.31611 0.01480 -629.5 < 2e-16 ***
## GenderM -0.14096 0.02012 -7.0 2.43e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## GenderM -0.660
# Men are relatively less likely than women to use either UM or UH.
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + (1|User),
data=twitterUSdat, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + (1 | User)
## Data: twitterUSdat
##
## AIC BIC logLik deviance df.resid
## 106780.8 106805.3 -53387.4 106774.8 25855
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2491 -0.3029 0.1101 0.5508 2.7342
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 0.5664 0.7526
## Number of obs: 25858, groups: User, 25858
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.204205 0.009272 -884.8 <2e-16 ***
## GenderM -0.203652 0.013438 -15.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## GenderM -0.686
download.file('http://www.let.rug.nl/wieling/uhum/TwitterNL/twitterNL.rda','twitterNL.rda')
load('twitterNL.rda') # Dutch Twitter data collected between 2011 and 2014
dim(twitterNL)
## [1] 38651 7
str(twitterNL)
## 'data.frame': 38651 obs. of 7 variables:
## $ User : Factor w/ 189907 levels "_______1992",..: 155487 62707 111648 14 15 111651 155492 111654 62710 26 ...
## $ BirthYear : num 1998 1993 1995 1990 1991 ...
## $ Gender : Factor w/ 2 levels "female","male": NA 2 NA 2 NA 2 1 NA NA NA ...
## $ nWords : int 2148 17699 11049 8854 182024 1839 1590 6271 1706 3120 ...
## $ nUM : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nUH : int 0 1 0 0 0 0 0 1 0 0 ...
## $ BirthYear.z: num 0.828 0.488 0.624 0.284 0.352 ...
colnames(twitterNL)
## [1] "User" "BirthYear" "Gender" "nWords" "nUM"
## [6] "nUH" "BirthYear.z"
tw1 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","umuh","nUM")
tw2 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","umwords","nUM")
tw3 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","uhwords","nUM")
tw4 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","uhmwords","nUM")
multiplot(tw1,tw2,tw4,tw3,cols=2)
# takes about 1 minute: Gender not significant
m <- glmer(cbind(nUM,nUH) ~ Gender + BirthYear.z + (1|User), data=twitterNL, family='binomial')
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nUH) ~ Gender + BirthYear.z + (1 | User)
## Data: twitterNL
##
## AIC BIC logLik deviance df.resid
## 13453.9 13484.2 -6723.0 13445.9 14377
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.8854 0.0000 0.0000 0.0000 0.8053
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 7.258 2.694
## Number of obs: 14381, groups: User, 14381
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.12375 0.08952 12.553 <2e-16 ***
## Gendermale -0.15102 0.09870 -1.530 0.126
## BirthYear.z 0.85995 0.09394 9.154 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Gndrml
## Gendermale -0.606
## BirthYear.z -0.487 0.015
# model performance
twnl = twitterNL[!is.na(twitterNL$Gender) & !is.na(twitterNL$BirthYear.z),] # exclude missing values
somers2( fitted(m), (twnl$nUM > twnl$nUH)*1 )
## C Dxy n Missing
## 9.024339e-01 8.048679e-01 1.438100e+04 0.000000e+00
somers2( fitted(m), twnl$nUM / (twnl$nUM + twnl$nUH) )
## C Dxy n Missing
## 0.9749681 0.9499362 5321.0000000 9060.0000000
Younger people are relatively more likely to use UM than older people. Men tend to be relatively less likely to use UM than women, though not significantly so.
# average proportion UM as opposed to UH per speaker
mean(twitterNL$nUM / (twitterNL$nUM + twitterNL$nUH), na.rm=T)
## [1] 0.65184
# percentage UM of all words
mean(twitterNL$nUM / twitterNL$nWords)
## [1] 0.0001069555
# percentage UH of all words
mean(twitterNL$nUH / twitterNL$nWords)
## [1] 5.501112e-05
# Men are relatively less likely than women to use UM and younger people
# are relatively more likely to use UM than older people.
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Gender + BirthYear.z + (1|User),
data=twitterNL,family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Gender + BirthYear.z + (1 | User)
## Data: twitterNL
##
## AIC BIC logLik deviance df.resid
## 30198.2 30228.5 -15095.1 30190.2 14377
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.9744 -0.3544 -0.2373 0.1244 3.1627
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 2.527 1.59
## Number of obs: 14381, groups: User, 14381
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.32794 0.04190 -246.51 <2e-16 ***
## Gendermale -0.36494 0.04157 -8.78 <2e-16 ***
## BirthYear.z 0.81890 0.03672 22.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Gndrml
## Gendermale -0.525
## BirthYear.z -0.479 0.004
# Men are relatively less likely than women to use UH and younger people
# are relatively more likely to use UH than older people.
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Gender + BirthYear.z + (1|User),
data=twitterNL, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Gender + BirthYear.z + (1 | User)
## Data: twitterNL
##
## AIC BIC logLik deviance df.resid
## 20115.0 20145.3 -10053.5 20107.0 14377
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.6742 -0.2228 -0.1504 -0.1000 2.5086
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 4.553 2.134
## Number of obs: 14381, groups: User, 14381
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.51287 0.06698 -171.88 < 2e-16 ***
## Gendermale -0.26664 0.05880 -4.53 5.77e-06 ***
## BirthYear.z 0.38734 0.04561 8.49 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Gndrml
## Gendermale -0.477
## BirthYear.z -0.328 0.016
# Men are relatively less likely than women to use either UM or UH and younger people
# are relatively more likely to use either UM or UH than older people.
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + BirthYear.z +
(1|User), data=twitterNL, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + BirthYear.z +
## (1 | User)
## Data: twitterNL
##
## AIC BIC logLik deviance df.resid
## 37874.3 37904.6 -18933.2 37866.3 14377
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.1563 -0.4324 -0.2895 0.2192 5.0039
##
## Random effects:
## Groups Name Variance Std.Dev.
## User (Intercept) 1.869 1.367
## Number of obs: 14381, groups: User, 14381
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.57543 0.03196 -299.63 <2e-16 ***
## Gendermale -0.32884 0.03407 -9.65 <2e-16 ***
## BirthYear.z 0.62213 0.02799 22.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Gndrml
## Gendermale -0.582
## BirthYear.z -0.442 0.011
To replicate the analysis presented above, you can just copy the following lines to the most recent version of R. Please note you also need to install Pandoc.
packages <- c("lme4","Hmisc","ggplot2","rmarkdown")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packages, rownames(installed.packages())))
}
download.file('http://www.let.rug.nl/wieling/uhum/analysis.Rmd', 'analysis.Rmd')
library(rmarkdown)
render('analysis.Rmd') # generates html file with results
browseURL(paste('file://', file.path(getwd(),'analysis.html'), sep='')) # shows result