## Generated on: November 23, 2015 - 15:08:23

Required libraries

# load required packages
library(lme4)
library(Hmisc)
library(ggplot2)

# custom plotting functions
download.file('http://www.let.rug.nl/wieling/uhum/plotting.R','plotting.R')
source('plotting.R')

# version information
R.version.string
## [1] "R version 3.2.2 (2015-08-14)"
packageVersion('lme4')
## [1] '1.1.9'
packageVersion('Hmisc')
## [1] '3.17.0'
packageVersion('ggplot2')
## [1] '1.0.1'

Results

For each dataset, we first visualize the relationship between UM and UH by dividing the speakers in four (about) equally sized groups on the basis of their year of birth (not for the U.S. Twitter data, as age information is not available). We further subdivide these four groups on the basis of the gender of the speakers. We then visualize the relationship between age and gender on the use of UM (as proportion of the total use of UH and UM), and if present, the relationship between year of recording and gender on the use of UM. If the number of words per speaker is available in the dataset, we also separately visualize the relationship between age and gender and the use of UM (as proportion of all words pronounced by the speaker) and UH (as proportion of all words pronounced by the speaker). In that case, we also visualize the relationship between age and gender and the use of any of the two hesitation markers (as proportion of all words pronounced by the speaker).

In the second step, we fit a logistic mixed-effects regression model predicting the occurrence of UM as opposed to UH. Consequently, in interpreting the model summaries below, positive estimates indicate an increased probability of using UM as opposed to UH, while negative estimates indicate the opposite. For each model the optimal random- and fixed-effects structure was determined via AIC comparisons. Only the final model for each data set is shown.

Finally, for the datasets for which the number of words per speaker was available, we fitted the following three logistic mixed-effects regression models: (1) predicting the occurrence of UM (as proportion of the total number of words), (2) predicting the occurrence of UH (as proportion of the total number of words), and (3) predicting the occurrecne of either UH or UM (as proportion of the total number of words). For the U.S. Twitter dataset we also show the geographical pattern.

American English: Switchboard

download.file('http://www.let.rug.nl/wieling/uhum/English/Switchboard/sb.rda','sb.rda')
load('sb.rda')

dim(sb)
## [1] 91001    35
str(sb)
## 'data.frame':    91001 obs. of  35 variables:
##  $ CallerID     : Factor w/ 520 levels "1000","1001",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age          : int  37 37 37 37 37 37 37 37 37 37 ...
##  $ Sex          : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Call         : Factor w/ 4844 levels "2001a","2001b",..: 231 25 95 301 63 231 335 131 25 95 ...
##  $ CallID       : Factor w/ 2438 levels "2001","2005",..: 116 13 48 151 32 116 168 66 13 48 ...
##  $ WordCount    : int  748 561 397 491 400 748 673 477 561 397 ...
##  $ CallPos      : int  327 11 250 601 5 581 361 281 412 211 ...
##  $ PhrasePos    : int  1 2 2 2 1 1 1 1 1 2 ...
##  $ BPhrasePos   : int  2 1 8 1 10 5 1 1 7 2 ...
##  $ PLength      : int  2 2 9 2 10 5 1 1 7 3 ...
##  $ FirstInPhrase: num  1 0 0 0 1 1 1 1 1 0 ...
##  $ LastInPhrase : num  0 1 0 1 0 0 1 1 0 0 ...
##  $ Duration     : num  0.493 0.433 0.316 0.28 0.318 ...
##  $ Word         : Factor w/ 2 levels "uh","um": 2 2 1 1 2 1 2 2 1 1 ...
##  $ Calls        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Region       : Factor w/ 9 levels "MIXED","NEW_ENGLAND",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ NextDuration : num  0.0924 0.4256 0.9936 0.5747 0.08 ...
##  $ NextWord     : Factor w/ 5404 levels "-[US]SR","-[company]'s",..: 1129 1130 5382 1130 2898 2898 1130 1130 5265 3610 ...
##  $ PrevDuration : num  0.46 0.309 0.292 0.26 0.388 ...
##  $ PrevWord     : Factor w/ 3578 levels "-[a]head","-[a]nd",..: 496 607 607 833 496 496 496 496 496 833 ...
##  $ PrevSilence  : num  0.46 0 0 0 0.388 ...
##  $ NextSilence  : num  0 0.426 0 0.575 0 ...
##  $ UM           : num  1 1 0 0 1 0 1 1 0 0 ...
##  $ nWords       : int  5236 5236 5236 5236 5236 5236 5236 5236 5236 5236 ...
##  $ nUM          : num  16 16 16 16 16 16 16 16 16 16 ...
##  $ nUH          : num  30 30 30 30 30 30 30 30 30 30 ...
##  $ Age.z        : num [1:91001, 1] 0.0232 0.0232 0.0232 0.0232 0.0232 ...
##  $ PhrasePos.z  : num  -0.675 -0.47 -0.47 -0.47 -0.675 ...
##  $ BPhrasePos.z : num  -0.421 -0.599 0.645 -0.599 1 ...
##  $ Duration.z   : num  0.999 0.617 -0.121 -0.351 -0.109 ...
##  $ NextSilence.z: num  -0.861 0.763 -0.861 1.332 -0.861 ...
##  $ PrevSilence.z: num  1.225 -0.641 -0.641 -0.641 0.934 ...
##  $ CallPos.z    : num  -0.319 -1.22 -0.539 0.461 -1.237 ...
##  $ PLength.z    : num  -0.775 -0.775 0.181 -0.775 0.317 ...
##  $ nWords.z     : num [1:91001, 1] -0.187 -0.187 -0.187 -0.187 -0.187 ...
colnames(sb)
##  [1] "CallerID"      "Age"           "Sex"           "Call"         
##  [5] "CallID"        "WordCount"     "CallPos"       "PhrasePos"    
##  [9] "BPhrasePos"    "PLength"       "FirstInPhrase" "LastInPhrase" 
## [13] "Duration"      "Word"          "Calls"         "Region"       
## [17] "NextDuration"  "NextWord"      "PrevDuration"  "PrevWord"     
## [21] "PrevSilence"   "NextSilence"   "UM"            "nWords"       
## [25] "nUM"           "nUH"           "Age.z"         "PhrasePos.z"  
## [29] "BPhrasePos.z"  "Duration.z"    "NextSilence.z" "PrevSilence.z"
## [33] "CallPos.z"     "PLength.z"     "nWords.z"
  • CallerID – ID of the caller
  • Age – Age of the speaker (in years)
  • Sex – Gender of the speaker (F: Female, M: Male)
  • Call – ID of the side of the call
  • CallID – ID of the call
  • WordCount – Word count per call side
  • CallPos – Position of the word in the call side
  • PhrasePos – Count from the start of the phrase, where “phrase” is “sequence between silences”
  • BPhrasePos – Count from the end of the phrase
  • PLength – Length of the current phrase
  • FirstInPhrase – The first word in the phrase is marked with 1, otherwise 0
  • LastInPhrase – The last word in the phrase is marked with 1, otherwise 0
  • Duration – Duration of the hesitation marker (in seconds)
  • Word – Hesitation marker
  • Calls – Number of calls of the speaker
  • Region – The region from which the caller originates
  • NextDuration – The duration of the word following the hesitation marker
  • NextWord – The word following the hesitation marker (may be equal to [silence], etc. )
  • PrevDuration – The duration of the word preceding the hesitation marker
  • PrevWord – The word preceding the hesitation marker (may be equal to [silence], etc. )
  • PrevSilence – The amount of silence preceding the hesitation marker (in sec.)
  • NextSilence – The amount of silence following the hesitation marker (in sec.)
  • UM – 1 when the hesitation marker is UM, 0 if it equals UH
  • nWords – Total number of words per speaker
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • .z – Z-transformation of the numerical predictors
sb$BirthYear = 1990 - sb$Age
s1 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard")
s2 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard","umwords")
s3 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard","uhwords")
s4 = visualizeUHUM(sb,"CallerID","BirthYear","Sex","Switchboard","uhmwords")
multiplot(s1,s2,s4,s3,cols=2)

Final model

# the model is not fitted here but loaded, as it takes more than one hour to fit
m <- glmer(UM ~ Age.z + Sex + FirstInPhrase + Age.z*LastInPhrase + Duration.z + PrevSilence.z + 
                Age.z*NextSilence.z + (1+Sex|CallID) + (0+Age.z|CallID) + 
                (1+NextSilence.z+PrevSilence.z+Duration.z+FirstInPhrase+LastInPhrase|CallerID),
           data=sb, family='binomial', 
           control=glmerControl(optCtrl=list(maxfun=1e5),optimizer="bobyqa"))
save(m, file = 'm-sb.rda')
download.file('http://www.let.rug.nl/wieling/uhum/English/Switchboard/m-sb.rda','m-sb.rda')
load('m-sb.rda')
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## UM ~ Age.z + Sex + FirstInPhrase + LastInPhrase * Age.z + Duration.z +  
##     PrevSilence.z + NextSilence.z * Age.z + (1 + NextSilence.z +  
##     PrevSilence.z + Duration.z + FirstInPhrase + LastInPhrase |  
##     CallerID) + (1 + Sex | CallID) + (0 + Age.z | CallID)
##    Data: sb
## Control: 
## glmerControl(optCtrl = list(maxfun = 1e+05), optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  63207.3  63537.0 -31568.7  63137.3    90966 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -94.793  -0.364  -0.168  -0.038  19.234 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr                         
##  CallID   Age.z         0.12008  0.3465                                
##  CallID.1 (Intercept)   0.56653  0.7527                                
##           SexM          0.46642  0.6829   -0.51                        
##  CallerID (Intercept)   2.22923  1.4931                                
##           NextSilence.z 0.02800  0.1673   -0.23                        
##           PrevSilence.z 0.03641  0.1908   -0.29  0.53                  
##           Duration.z    0.22828  0.4778    0.65 -0.19 -0.11            
##           FirstInPhrase 0.48557  0.6968   -0.62 -0.02  0.13 -0.50      
##           LastInPhrase  0.23523  0.4850   -0.50  0.08  0.13 -0.47  0.31
## Number of obs: 91001, groups:  CallID, 2438; CallerID, 520
## 
## Fixed effects:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.04340    0.10054 -20.324  < 2e-16 ***
## Age.z               -0.65581    0.06197 -10.583  < 2e-16 ***
## SexM                -1.03184    0.11481  -8.988  < 2e-16 ***
## FirstInPhrase        0.67146    0.05402  12.430  < 2e-16 ***
## LastInPhrase         1.05908    0.04514  23.461  < 2e-16 ***
## Duration.z           0.87064    0.02775  31.374  < 2e-16 ***
## PrevSilence.z        0.11561    0.02131   5.424 5.83e-08 ***
## NextSilence.z        0.11295    0.01870   6.041 1.53e-09 ***
## Age.z:LastInPhrase   0.09569    0.04529   2.113  0.03460 *  
## Age.z:NextSilence.z  0.05878    0.01938   3.032  0.00243 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Age.z  SexM   FrstIP LstInP Drtn.z PrvSl. NxtSl. A.:LIP
## Age.z       -0.083                                                        
## SexM        -0.624  0.147                                                 
## FirstInPhrs -0.434  0.005  0.044                                          
## LastInPhras -0.404 -0.007  0.013  0.122                                   
## Duration.z   0.378 -0.012 -0.038 -0.294 -0.251                            
## PrevSilnc.z  0.026 -0.001 -0.012 -0.508  0.050 -0.057                     
## NextSilnc.z  0.048 -0.024 -0.002  0.022 -0.476 -0.064  0.097              
## Ag.z:LstInP -0.009 -0.475  0.007 -0.004 -0.007  0.003  0.003  0.036       
## Ag.z:NxtSl. -0.012  0.151  0.001  0.002  0.030  0.015 -0.005 -0.067 -0.536
# model performance
somers2(fitted(m), sb$UM)
##            C          Dxy            n      Missing 
## 9.184639e-01 8.369279e-01 9.100100e+04 0.000000e+00

Interpretation

Older people and men are relatively less likely to use UM than younger people and women. When the hesitation marker’s duration is longer, when it is preceded or followed by a longer silence, or when it is the first or the last word in the phrase (i.e. if it is preceded or followed by a pause) it is more likely that the hesitation marker is equal to UM. The duration of the following pause and if the hesitation marker is the final word in the phrase are more predictive for the occurrence of UM than for younger people. This suggests that younger people are using UM also more when it is not the final word in the sentence and to mark shorter pauses.

Additional models

sb2 = unique(sb[,c("CallerID","Age","Age.z","Sex","nWords","nUM","nUH")])

dim(sb2)
## [1] 520   7
# average proportion UM as opposed to UH per speaker
mean(sb2$nUM / (sb2$nUM + sb2$nUH), na.rm=T)
## [1] 0.2825083
# percentage UM of all words
mean(sb2$nUM / sb2$nWords)
## [1] 0.007514793
# percentage UH of all words
mean(sb2$nUH / sb2$nWords)
## [1] 0.02207552
# Older people and men are relatively less likely to use UM
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1|CallerID), data=sb2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1 | CallerID)
##    Data: sb2
## 
##      AIC      BIC   logLik deviance df.resid 
##   4572.1   4589.1  -2282.1   4564.1      516 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.54416 -0.19297  0.01641  0.11739  0.86094 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  CallerID (Intercept) 0.8562   0.9253  
## Number of obs: 520, groups:  CallerID, 520
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.06100    0.06198  -81.66  < 2e-16 ***
## SexM        -0.48963    0.08561   -5.72 1.07e-08 ***
## Age.z       -0.33064    0.04299   -7.69 1.46e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) SexM  
## SexM  -0.725       
## Age.z -0.059  0.091
# Older people and men are relatively more likely to use UH
# (the age effect is stronger for women than for men)
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1|CallerID), data=sb2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1 | CallerID)
##    Data: sb2
## 
##      AIC      BIC   logLik deviance df.resid 
##   5413.9   5435.2  -2701.9   5403.9      515 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.42459 -0.14327  0.00744  0.09780  1.02197 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  CallerID (Intercept) 0.3643   0.6036  
## Number of obs: 520, groups:  CallerID, 520
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.47501    0.04107 -108.97  < 2e-16 ***
## SexM         0.79822    0.05536   14.42  < 2e-16 ***
## SexF:Age.z   0.31207    0.03986    7.83 4.89e-15 ***
## SexM:Age.z   0.20290    0.03823    5.31 1.11e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) SexM   SxF:A.
## SexM       -0.741              
## SexF:Age.z -0.130  0.096       
## SexM:Age.z  0.000  0.044  0.000
# Older people and men are relatively more likely to use either UH or UM
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1|CallerID), data=sb2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1 | CallerID)
##    Data: sb2
## 
##      AIC      BIC   logLik deviance df.resid 
##   5657.0   5674.0  -2824.5   5649.0      516 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.69908 -0.09884  0.00858  0.10785  0.91325 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  CallerID (Intercept) 0.2731   0.5226  
## Number of obs: 520, groups:  CallerID, 520
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.89140    0.03494 -111.37  < 2e-16 ***
## SexM         0.46459    0.04754    9.77  < 2e-16 ***
## Age.z        0.10218    0.02373    4.31 1.66e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) SexM  
## SexM  -0.737       
## Age.z -0.076  0.090

American English: Fisher

download.file('http://www.let.rug.nl/wieling/uhum/English/Fisher/fisher.rda','fisher.rda')
load('fisher.rda')

dim(fisher)
## [1] 19753    16
str(fisher)
## 'data.frame':    19753 obs. of  16 variables:
##  $ SpeakerID      : Factor w/ 10313 levels "1006","1007",..: 2517 2517 2518 2518 2518 2519 2520 2520 2520 1 ...
##  $ Call           : Factor w/ 19753 levels "00001_A","00001_B",..: 8401 8514 13335 13471 13638 5429 18105 18383 18438 651 ...
##  $ CallID         : Factor w/ 11355 levels "00001","00002",..: 4740 4804 7527 7607 7703 3048 10311 10493 10527 391 ...
##  $ Sex            : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 2 1 ...
##  $ Age            : int  18 18 31 31 31 33 38 38 38 19 ...
##  $ Edu            : int  11 11 18 18 18 16 12 12 12 13 ...
##  $ Loc            : Factor w/ 93 levels "Afghanistan",..: 19 19 91 91 91 22 70 70 70 70 ...
##  $ nWordsPerCall  : int  739 644 689 802 516 1242 970 1296 1050 416 ...
##  $ nUMPerCall     : int  2 5 16 14 12 15 8 9 8 10 ...
##  $ nUHPerCall     : int  0 1 0 0 1 0 2 16 31 0 ...
##  $ nWords         : int  1383 1383 2007 2007 2007 1242 3316 3316 3316 416 ...
##  $ nUM            : int  7 7 42 42 42 15 25 25 25 10 ...
##  $ nUH            : int  1 1 1 1 1 0 49 49 49 0 ...
##  $ Age.z          : num  -1.405 -1.405 -0.399 -0.399 -0.399 ...
##  $ Edu.z          : num  -1.33 -1.33 1.11 1.11 1.11 ...
##  $ nWordsPerCall.z: num  -0.683 -1.004 -0.852 -0.471 -1.436 ...
colnames(fisher)
##  [1] "SpeakerID"       "Call"            "CallID"         
##  [4] "Sex"             "Age"             "Edu"            
##  [7] "Loc"             "nWordsPerCall"   "nUMPerCall"     
## [10] "nUHPerCall"      "nWords"          "nUM"            
## [13] "nUH"             "Age.z"           "Edu.z"          
## [16] "nWordsPerCall.z"
  • SpeakerID – ID of the speaker
  • Call – ID of the side of the call
  • CallID – ID of the call
  • Sex – Gender of the speaker (F: Female, M: Male)
  • Age – Age of the speaker (in years)
  • Edu – Number of years of education
  • Loc – Location of call
  • nWordsPerCall – Word count per speaker in call
  • nUMPerCall – Frequency of UMs per speaker in call
  • nUHPerCall – Frequency of UHs per speaker in call
  • nWords – Total number of words per speaker
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • .z – Z-transformation of the numerical predictors
fisher$BirthYear = 2002 - fisher$Age
fi1 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","umuh","nUM")
fi2 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","umwords","nUM")
fi3 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","uhwords","nUM")
fi4 = visualizeUHUM(fisher,"SpeakerID","BirthYear","Sex","Fisher","uhmwords","nUM")
multiplot(fi1,fi2,fi4,fi3,cols=2)

Final model

# takes about 5 minutes
m <- glmer(cbind(nUMPerCall,nUHPerCall) ~ Sex + Age.z + Edu.z + nWordsPerCall.z + (1|SpeakerID) + (0+Age.z|CallID) +
                                          (1|CallID), data=fisher, family='binomial')
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.0017274 (tol =
## 0.001, component 1)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUMPerCall, nUHPerCall) ~ Sex + Age.z + Edu.z + nWordsPerCall.z +  
##     (1 | SpeakerID) + (0 + Age.z | CallID) + (1 | CallID)
##    Data: fisher
## 
##      AIC      BIC   logLik deviance df.resid 
##  84210.3  84273.4 -42097.1  84194.3    19745 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2889 -0.2048  0.0729  0.4642  2.2729 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  CallID    (Intercept) 3.58793  1.8942  
##  CallID.1  Age.z       0.07864  0.2804  
##  SpeakerID (Intercept) 1.12218  1.0593  
## Number of obs: 19753, groups:  CallID, 11355; SpeakerID, 10313
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.90785    0.02942   64.84  < 2e-16 ***
## SexM            -1.36907    0.03696  -37.04  < 2e-16 ***
## Age.z           -0.39052    0.01718  -22.74  < 2e-16 ***
## Edu.z            0.10539    0.01639    6.43 1.27e-10 ***
## nWordsPerCall.z  0.08612    0.01347    6.39 1.63e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SexM   Age.z  Edu.z 
## SexM        -0.552                     
## Age.z       -0.094  0.129              
## Edu.z        0.029 -0.073 -0.077       
## nWrdsPrCll.  0.003 -0.058 -0.144 -0.024
## convergence code: 0
## Model failed to converge with max|grad| = 0.0017274 (tol = 0.001, component 1)
# model performance
somers2( fitted(m), (fisher$nUMPerCall > fisher$nUHPerCall)*1 )
##            C          Dxy            n      Missing 
## 9.643999e-01 9.287997e-01 1.975300e+04 0.000000e+00
somers2( fitted(m), fisher$nUMPerCall / (fisher$nUMPerCall+fisher$nUHPerCall) )
##            C          Dxy            n      Missing 
## 9.768619e-01 9.537238e-01 1.910100e+04 6.520000e+02

Interpretation

Older people and men are relatively less likely to use UM than younger people and women. People who received more education are more likely to use UM than people who received less education. Finally, the more words people are pronouncing in a call, the more likely they are to use UM as opposed to UH.

Additional models

fisher2 = unique(fisher[,c("SpeakerID","Age","Age.z","Edu","Edu.z","Sex","nWords","nUM","nUH")])

dim(fisher2)
## [1] 10313     9
# average proportion UM as opposed to UH per speaker
mean(fisher2$nUM / (fisher2$nUM + fisher2$nUH), na.rm=T)
## [1] 0.6407968
# percentage UM of all words
mean(fisher2$nUM / fisher2$nWords)
## [1] 0.009861317
# percentage UH of all words
mean(fisher2$nUH / fisher2$nWords)
## [1] 0.006765215
# Older people, lower educated people, and men are relatively less likely to use UM 
# (the age and education effects are strongest for men; the education effect is stronger for older people)
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Sex + Sex:Age.z + Sex:Edu.z + Age.z:Edu.z+  (1|SpeakerID), data=fisher2, 
                     family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUM, nWords - nUM) ~ Sex + Sex:Age.z + Sex:Edu.z + Age.z:Edu.z +  
##     (1 | SpeakerID)
##    Data: fisher2
## 
##      AIC      BIC   logLik deviance df.resid 
##  76813.0  76870.9 -38398.5  76797.0    10305 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.76962 -0.28271  0.02718  0.18178  1.10784 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SpeakerID (Intercept) 0.6849   0.8276  
## Number of obs: 10313, groups:  SpeakerID, 10313
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.791767   0.011466  -417.9  < 2e-16 ***
## SexM        -0.352057   0.018350   -19.2  < 2e-16 ***
## SexF:Age.z  -0.065989   0.011796    -5.6 2.22e-08 ***
## SexM:Age.z  -0.137460   0.014076    -9.8  < 2e-16 ***
## SexF:Edu.z   0.056993   0.011470     5.0 6.74e-07 ***
## SexM:Edu.z   0.110022   0.014525     7.6 3.60e-14 ***
## Age.z:Edu.z  0.042092   0.008816     4.8 1.80e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SexM   SxF:A. SxM:A. SxF:E. SxM:E.
## SexM        -0.613                                   
## SexF:Age.z  -0.108  0.060                            
## SexM:Age.z   0.002  0.135  0.000                     
## SexF:Edu.z   0.043 -0.013 -0.028 -0.001              
## SexM:Edu.z  -0.003 -0.100  0.007 -0.178 -0.012       
## Age.z:Edu.z -0.022 -0.069  0.102  0.003 -0.187  0.066
# Older people and men are relatively more likely to use UH than younger people and wormen
# (the age effect is strongest for women)
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + Edu.z + (1|SpeakerID), data=fisher2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + Edu.z + (1 | SpeakerID)
##    Data: fisher2
## 
##      AIC      BIC   logLik deviance df.resid 
##  68396.7  68440.2 -34192.4  68384.7    10307 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.37056 -0.39042  0.03815  0.15388  1.34505 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SpeakerID (Intercept) 1.538    1.24    
## Number of obs: 10313, groups:  SpeakerID, 10313
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.16368    0.01871  -329.5   <2e-16 ***
## SexM         0.98717    0.02762    35.7   <2e-16 ***
## Edu.z       -0.02255    0.01364    -1.7   0.0983 .  
## SexF:Age.z   0.34914    0.01828    19.1   <2e-16 ***
## SexM:Age.z   0.20889    0.01994    10.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) SexM   Edu.z  SxF:A.
## SexM       -0.649                     
## Edu.z       0.038 -0.077              
## SexF:Age.z -0.169  0.110  0.006       
## SexM:Age.z -0.010  0.099 -0.112  0.000
# Men, older people and higher educated people are relatively more likely to use either UH or UM
# (the education effect is strongest for older people)
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z*Edu.z + (1|SpeakerID), data=fisher2, 
                       family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z * Edu.z +  
##     (1 | SpeakerID)
##    Data: fisher2
## 
##      AIC      BIC   logLik deviance df.resid 
##  85657.2  85700.6 -42822.6  85645.2    10307 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.06427 -0.22205  0.02361  0.16397  0.85843 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SpeakerID (Intercept) 0.5099   0.7141  
## Number of obs: 10313, groups:  SpeakerID, 10313
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.404077   0.009791  -449.8  < 2e-16 ***
## SexM         0.209628   0.015382    13.6  < 2e-16 ***
## Age.z        0.063810   0.007561     8.4  < 2e-16 ***
## Edu.z        0.031664   0.007594     4.2 3.05e-05 ***
## Age.z:Edu.z  0.026231   0.007263     3.6 0.000304 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) SexM   Age.z  Edu.z 
## SexM        -0.639                     
## Age.z       -0.094  0.124              
## Edu.z        0.043 -0.067 -0.086       
## Age.z:Edu.z -0.012 -0.058  0.099 -0.123

American English: Philadelphia Neighborhood Corpus

download.file('http://www.let.rug.nl/wieling/uhum/English/PNC/pnc.rda','pnc.rda')
load('pnc.rda')

dim(pnc)
## [1] 25514    39
str(pnc)
## 'data.frame':    25514 obs. of  39 variables:
##  $ idstring                 : Factor w/ 395 levels "PH00-1-1-","PH00-1-2-",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ birthyear                : int  1979 1979 1979 1979 1979 1979 1979 1979 1979 1979 ...
##  $ age                      : int  21 21 21 21 21 21 21 21 21 21 ...
##  $ sex                      : Factor w/ 2 levels "f","m": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ethnicity                : Factor w/ 36 levels "","a","g","g/i",..: 13 13 13 13 13 13 13 13 13 13 ...
##  $ schooling                : num  14 14 14 14 14 14 14 14 14 14 ...
##  $ recyear                  : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ word                     : Factor w/ 4 levels "AND_UH","AND_UM",..: 3 3 4 3 3 3 3 3 4 3 ...
##  $ start_time               : num  24.4 35 37.9 44.5 57.6 ...
##  $ end_time                 : num  24.7 35.2 38.3 44.7 57.8 ...
##  $ vowel_start              : num  24.4 35 37.9 44.5 57.6 ...
##  $ vowel_end                : num  24.7 35.2 38.1 44.7 57.8 ...
##  $ nasal_start              : num  NA NA 38.1 NA NA ...
##  $ nasal_end                : num  NA NA 38.3 NA NA ...
##  $ next_seg                 : Factor w/ 49 levels "AA1","AA2","AE0",..: 42 22 44 17 11 44 44 44 44 44 ...
##  $ next_seg_start           : num  24.7 35.2 38.3 44.7 57.8 ...
##  $ next_seg_end             : num  24.9 35.4 38.4 44.7 57.9 ...
##  $ next_silence_length      : num  0 0 0.12 0 0 ...
##  $ next_silence_length.log  : num  -5.3 -5.3 -2.08 -5.3 -5.3 ...
##  $ chunk_start              : num  24.4 35 37.9 44.5 57.6 ...
##  $ chunk_end                : num  25.3 37.1 38.8 45.6 59 ...
##  $ transcribed              : num  2811 2811 2811 2811 2811 ...
##  $ total                    : num  2814 2814 2814 2814 2814 ...
##  $ nvowels                  : int  3078 3078 3078 3078 3078 3078 3078 3078 3078 3078 ...
##  $ duration                 : num  0.3 0.28 0.37 0.2 0.18 ...
##  $ pausebefore              : num  1 1 1 1 1 0 0 0 1 0 ...
##  $ pauseafter               : num  0 0 0 0 0 1 1 1 1 1 ...
##  $ UM                       : num  0 0 1 0 0 0 0 0 1 0 ...
##  $ nwords                   : int  6551 6551 6551 6551 6551 6551 6551 6551 6551 6551 ...
##  $ nUM                      : num  60 60 60 60 60 60 60 60 60 60 ...
##  $ nUH                      : num  90 90 90 90 90 90 90 90 90 90 ...
##  $ birthyear.z              : num  1.4 1.4 1.4 1.4 1.4 ...
##  $ age.z                    : num  -0.949 -0.949 -0.949 -0.949 -0.949 ...
##  $ schooling.z              : num  0.697 0.697 0.697 0.697 0.697 ...
##  $ recyear.z                : num  1.07 1.07 1.07 1.07 1.07 ...
##  $ next_silence_length.z    : num  -0.0361 -0.0361 -0.0324 -0.0361 -0.0361 ...
##  $ next_silence_length.log.z: num  -1.4297 -1.4297 -0.0274 -1.4297 -1.4297 ...
##  $ duration.z               : num  0.265 0.14 0.705 -0.363 -0.488 ...
##  $ nwords.z                 : num  1.13 1.13 1.13 1.13 1.13 ...
colnames(pnc)
##  [1] "idstring"                  "birthyear"                
##  [3] "age"                       "sex"                      
##  [5] "ethnicity"                 "schooling"                
##  [7] "recyear"                   "word"                     
##  [9] "start_time"                "end_time"                 
## [11] "vowel_start"               "vowel_end"                
## [13] "nasal_start"               "nasal_end"                
## [15] "next_seg"                  "next_seg_start"           
## [17] "next_seg_end"              "next_silence_length"      
## [19] "next_silence_length.log"   "chunk_start"              
## [21] "chunk_end"                 "transcribed"              
## [23] "total"                     "nvowels"                  
## [25] "duration"                  "pausebefore"              
## [27] "pauseafter"                "UM"                       
## [29] "nwords"                    "nUM"                      
## [31] "nUH"                       "birthyear.z"              
## [33] "age.z"                     "schooling.z"              
## [35] "recyear.z"                 "next_silence_length.z"    
## [37] "next_silence_length.log.z" "duration.z"               
## [39] "nwords.z"
  • idstring – Unique id string for each speaker
  • birthyear – Year of birth of the speaker
  • age – Age of speaker at time of interview
  • sex – Gender of speaker (f: female, m: male)
  • ethnicity – A number of single character codes for speakers’ ethnicity
  • schooling – Number of years of schooling for this speaker
  • recyear – Year of interview
  • word – UM, UH, AND_UH, or AND_UM (UM_UH occurrences were removed)
  • start_time – Onset time of filled pause, in seconds
  • end_time – Offset time of filled pause, in seconds
  • vowel_start – Onset time of filled pause vowel
  • vowel_end – Offset time of filled pause vowel
  • nasal_start – Onset time of nasal, if UM
  • nasal_end – Offset time of nasal, if UM
  • next_seg – The next segment on the phone tier
  • next_seg_start – Start time of next segment
  • next_seg_end – End time of next segment
  • next_silence_length – Length of silence after UH/UM
  • next_silence_length.log – Log-transformation (after adding 0.005 sec.) of the length of the silence following UM/UH
  • chunk_start – Onset time of speech chunk (i.e. continuous speech bordered on each side by a pause of 200 ms. or more)
  • chunk_end – Offset time of speech chunk
  • transcribed – Total number of transcribed seconds of interview
  • total – Total length of recording
  • nvowels – Number of measured stressed vowels (irrelevant to UM/UH)
  • duration – Duration of the word
  • pausebefore – A binary variable indicating if the word is at the start of the chunck (and has a pause before: 1) or not (0)
  • pauseafter – A binary variable indicating if the word is at the end of the chunck (and has a pause after: 1) or not (0)
  • UM – A binary variable indicating if the word is equal to UM / AND_UM (1) or UH / AND_UH (0)
  • nwords – Total number of words per speaker
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • .z – Z-transformation of the numerical predictors
p1 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC")
p2 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC","umwords",nwordVar='nwords')
p3 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC","uhwords",nwordVar='nwords')
p4 = visualizeUHUM(pnc,"idstring","birthyear","sex","PNC","uhmwords",nwordVar='nwords')
multiplot(p1,p2,p4,p3,cols=2)

n5 = visualizeUHUM(pnc,"idstring","recyear","sex","PNC")
n5

Final model

# takes a few minutes
m <- glmer(UM ~ sex * age.z +  recyear.z  + duration.z  + pauseafter + age.z*next_silence_length.log.z + 
                (1+duration.z+pauseafter+next_silence_length.log.z|idstring), 
           family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa")) 
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: UM ~ sex * age.z + recyear.z + duration.z + pauseafter + age.z *  
##     next_silence_length.log.z + (1 + duration.z + pauseafter +  
##     next_silence_length.log.z | idstring)
##    Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  14197.7  14352.5  -7079.9  14159.7    25495 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -18.1707  -0.2717  -0.1085   0.0162  31.3339 
## 
## Random effects:
##  Groups   Name                      Variance Std.Dev. Corr             
##  idstring (Intercept)               1.8945   1.3764                    
##           duration.z                0.5533   0.7438    0.48            
##           pauseafter                0.2606   0.5105    0.12  0.11      
##           next_silence_length.log.z 0.2260   0.4753   -0.14 -0.12 -0.10
## Number of obs: 25514, groups:  idstring, 395
## 
## Fixed effects:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.75359    0.12130 -14.456  < 2e-16 ***
## sexm                            -1.31029    0.18017  -7.272 3.53e-13 ***
## age.z                           -1.16971    0.10428 -11.217  < 2e-16 ***
## recyear.z                        0.54344    0.08132   6.682 2.35e-11 ***
## duration.z                       1.25023    0.05590  22.364  < 2e-16 ***
## pauseafter                       0.59255    0.10983   5.395 6.85e-08 ***
## next_silence_length.log.z        0.54594    0.06422   8.501  < 2e-16 ***
## sexm:age.z                      -0.51507    0.17604  -2.926 0.003435 ** 
## age.z:next_silence_length.log.z  0.17922    0.04760   3.765 0.000166 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) sexm   age.z  rcyr.z drtn.z pasftr nx__.. sxm:g.
## sexm        -0.510                                                 
## age.z        0.033  0.040                                          
## recyear.z    0.000 -0.065 -0.141                                   
## duration.z   0.206 -0.104 -0.066  0.042                            
## pauseafter  -0.415 -0.101 -0.020  0.005  0.023                     
## nxt_slnc_..  0.203  0.025 -0.033  0.008 -0.045 -0.660              
## sexm:age.z  -0.013  0.110 -0.553 -0.022 -0.023 -0.001 -0.031       
## ag.z:nx__.. -0.061 -0.016 -0.082 -0.007  0.011 -0.005  0.179 -0.046
# model performance
somers2(fitted(m), pnc$UM)
##            C          Dxy            n      Missing 
## 9.497833e-01 8.995665e-01 2.551400e+04 0.000000e+00

Interpretation

Older people and men are relatively less likely to use UM than younger people and women (the age effect is strongest for men, as the interaction shows). People recorded in more recent years are more likely to use UM than people recorded earlier. When the hesitation marker’s duration is longer, when it is followed by a pause (of at least 200 ms.), and when the following pause is longer (especially so for older people), it is more likely that the hesitation marker is equal to UM. The presence of a pause before the hesitation marker does not significantly distinguish UM from UH (see below).

Alternative final model

The following (slightly better fitting) model uses year of birth and age, rather than year of recording and age and shows that the age effect is only limited, as year of birth is the most important predictor. This means that the year of birth is the determining factor for the amount of UM people use, as opposed to their age. Furthermore, the predictive power of the duration of the silence following the hesitation marker is diminishing for people born later. This suggests that people born later are using UM more in general, so not only denoting longer pauses.

# takes a few minutes
m2 <- glmer(UM ~ sex + sex : birthyear.z +  age.z  + duration.z  + pauseafter + 
                 birthyear.z*next_silence_length.log.z +
                 (1+duration.z+pauseafter+next_silence_length.log.z|idstring),   
                 family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa"))

summary(m2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: UM ~ sex + sex:birthyear.z + age.z + duration.z + pauseafter +  
##     birthyear.z * next_silence_length.log.z + (1 + duration.z +  
##     pauseafter + next_silence_length.log.z | idstring)
##    Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  14193.0  14347.8  -7077.5  14155.0    25495 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -18.258  -0.272  -0.108   0.017  33.671 
## 
## Random effects:
##  Groups   Name                      Variance Std.Dev. Corr             
##  idstring (Intercept)               1.8419   1.3572                    
##           duration.z                0.5552   0.7451    0.49            
##           pauseafter                0.2659   0.5157    0.16  0.12      
##           next_silence_length.log.z 0.2321   0.4818   -0.15 -0.10 -0.14
## Number of obs: 25514, groups:  idstring, 395
## 
## Fixed effects:
##                                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)                           -1.75419    0.12039 -14.571  < 2e-16
## sexm                                  -1.31878    0.17863  -7.383 1.55e-13
## age.z                                 -0.32691    0.15898  -2.056 0.039750
## duration.z                             1.24945    0.05594  22.336  < 2e-16
## pauseafter                             0.58653    0.10978   5.343 9.16e-08
## birthyear.z                            0.88750    0.17478   5.078 3.82e-07
## next_silence_length.log.z              0.55821    0.06476   8.620  < 2e-16
## sexm:birthyear.z                       0.64698    0.18000   3.594 0.000325
## birthyear.z:next_silence_length.log.z -0.18944    0.04928  -3.844 0.000121
##                                          
## (Intercept)                           ***
## sexm                                  ***
## age.z                                 *  
## duration.z                            ***
## pauseafter                            ***
## birthyear.z                           ***
## next_silence_length.log.z             ***
## sexm:birthyear.z                      ***
## birthyear.z:next_silence_length.log.z ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) sexm   age.z  drtn.z pasftr brthy. nx__.. sxm:b.
## sexm        -0.509                                                 
## age.z        0.009 -0.001                                          
## duration.z   0.211 -0.104 -0.015                                   
## pauseafter  -0.411 -0.098 -0.009  0.022                            
## birthyear.z -0.013 -0.032  0.798  0.030  0.012                     
## nxt_slnc_..  0.191  0.025  0.024 -0.034 -0.660  0.037              
## sxm:brthyr.  0.014 -0.107 -0.017  0.019 -0.016 -0.349  0.049       
## brthy.:__..  0.075  0.015  0.025 -0.014  0.001 -0.028 -0.205 -0.043
# model performance
somers2(fitted(m2), pnc$UM)
##            C          Dxy            n      Missing 
## 9.497548e-01 8.995096e-01 2.551400e+04 0.000000e+00
AIC(m) # model on the basis of age and year of recording
## [1] 14197.72
AIC(m2) # model on the basis of year of birth and age (better: lower AIC)
## [1] 14193

Additional models

pnc2 = unique(pnc[,c("idstring","age","age.z","sex","recyear","recyear.z","nwords","nUM","nUH")])

dim(pnc2)
## [1] 395   9
# average proportion UM as opposed to UH per speaker
mean(pnc2$nUM / (pnc2$nUM + pnc2$nUH), na.rm=T)
## [1] 0.2765317
# percentage UM of all words
mean(pnc2$nUM / pnc2$nwords)
## [1] 0.004543936
# percentage UH of all words
mean(pnc2$nUH / pnc2$nwords)
## [1] 0.01321371
# Older people, people who were recorded earlier, and men are relatively less likely to use UM
summary(mUM <- glmer(cbind(nUM, nwords - nUM) ~ sex + age.z + recyear.z + (1|idstring), data=pnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUM, nwords - nUM) ~ sex + age.z + recyear.z + (1 | idstring)
##    Data: pnc2
## 
##      AIC      BIC   logLik deviance df.resid 
##   2680.0   2699.9  -1335.0   2670.0      390 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.33970 -0.33877  0.01887  0.14231  0.91321 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  idstring (Intercept) 1.326    1.151   
## Number of obs: 395, groups:  idstring, 395
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.80473    0.08402  -69.09  < 2e-16 ***
## sexm        -1.04225    0.13214   -7.89 3.08e-15 ***
## age.z       -0.69226    0.06650  -10.41  < 2e-16 ***
## recyear.z    0.39701    0.06384    6.22 5.01e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) sexm   age.z 
## sexm      -0.605              
## age.z      0.062  0.044       
## recyear.z -0.039 -0.022 -0.131
# Younger people, people who were more recently recorded, and women are less likely to use UH 
summary(mUH <- glmer(cbind(nUH, nwords - nUH) ~ sex + age.z + recyear.z + (1|idstring), data=pnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH, nwords - nUH) ~ sex + age.z + recyear.z + (1 | idstring)
##    Data: pnc2
## 
##      AIC      BIC   logLik deviance df.resid 
##   3512.8   3532.7  -1751.4   3502.8      390 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.52779 -0.20029 -0.00452  0.13236  0.90533 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  idstring (Intercept) 0.5462   0.739   
## Number of obs: 395, groups:  idstring, 395
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.86681    0.05270  -92.34  < 2e-16 ***
## sexm         0.40644    0.07870    5.16 2.41e-07 ***
## age.z        0.46963    0.03962   11.85  < 2e-16 ***
## recyear.z   -0.24388    0.03954   -6.17 6.95e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) sexm   age.z 
## sexm      -0.666              
## age.z     -0.047  0.010       
## recyear.z  0.017 -0.003 -0.109
# Only older people are relatively more likely to use either UH or UM.
summary(mUHUM <- glmer(cbind(nUH + nUM, nwords - nUH - nUM) ~ sex + age.z + recyear.z + (1|idstring), data=pnc2, 
                       family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH + nUM, nwords - nUH - nUM) ~ sex + age.z + recyear.z +  
##     (1 | idstring)
##    Data: pnc2
## 
##      AIC      BIC   logLik deviance df.resid 
##   3776.9   3796.8  -1883.4   3766.9      390 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.81074 -0.17015  0.01041  0.11135  0.63540 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  idstring (Intercept) 0.4546   0.6742  
## Number of obs: 395, groups:  idstring, 395
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.27471    0.04703  -90.90  < 2e-16 ***
## sexm         0.09042    0.07094    1.27    0.202    
## age.z        0.17214    0.03547    4.85 1.21e-06 ***
## recyear.z   -0.04180    0.03530   -1.18    0.236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) sexm   age.z 
## sexm      -0.661              
## age.z     -0.016 -0.001       
## recyear.z  0.000  0.001 -0.096
# Effect size of schooling (n.s.)
summary(m0 <- glmer(UM ~ sex + sex : age.z +  recyear.z  + duration.z  + pauseafter + schooling.z 
                         + age.z*next_silence_length.log.z +
                         (1+duration.z+pauseafter+next_silence_length.log.z|idstring), 
           family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: UM ~ sex + sex:age.z + recyear.z + duration.z + pauseafter +  
##     schooling.z + age.z * next_silence_length.log.z + (1 + duration.z +  
##     pauseafter + next_silence_length.log.z | idstring)
##    Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  13261.3  13422.8  -6610.7  13221.3    23716 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -17.3792  -0.2726  -0.1111   0.0059  30.1706 
## 
## Random effects:
##  Groups   Name                      Variance Std.Dev. Corr             
##  idstring (Intercept)               1.9516   1.3970                    
##           duration.z                0.5718   0.7562    0.48            
##           pauseafter                0.2564   0.5063    0.06  0.13      
##           next_silence_length.log.z 0.1984   0.4454   -0.10 -0.08 -0.07
## Number of obs: 23736, groups:  idstring, 361
## 
## Fixed effects:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.75969    0.12737 -13.815  < 2e-16 ***
## sexm                            -1.35362    0.19247  -7.033 2.02e-12 ***
## recyear.z                        0.51473    0.08807   5.845 5.07e-09 ***
## duration.z                       1.24921    0.05897  21.185  < 2e-16 ***
## pauseafter                       0.59326    0.11546   5.138 2.77e-07 ***
## schooling.z                      0.03201    0.08752   0.366   0.7145    
## age.z                           -1.15043    0.11161 -10.308  < 2e-16 ***
## next_silence_length.log.z        0.53945    0.06656   8.104 5.31e-16 ***
## sexm:age.z                      -0.43060    0.19086  -2.256   0.0241 *  
## age.z:next_silence_length.log.z  0.16061    0.04957   3.240   0.0012 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) sexm   rcyr.z drtn.z pasftr schln. age.z  nx__.. sxm:g.
## sexm        -0.506                                                        
## recyear.z    0.027 -0.064                                                 
## duration.z   0.214 -0.114  0.030                                          
## pauseafter  -0.425 -0.099  0.015  0.025                                   
## schooling.z  0.041 -0.126 -0.228  0.041 -0.019                            
## age.z        0.040  0.052 -0.073 -0.078 -0.015 -0.171                     
## nxt_slnc_..  0.216  0.028 -0.017 -0.028 -0.666  0.028 -0.028              
## sexm:age.z  -0.025  0.099 -0.028 -0.010  0.017  0.051 -0.551 -0.048       
## ag.z:nx__.. -0.051 -0.016 -0.030  0.011 -0.015  0.064 -0.086  0.197 -0.045
# Effect size of pause before hesitation marker (n.s.)
summary(m1 <- glmer(UM ~ sex * age.z +  recyear.z  + duration.z  + pauseafter + pausebefore + 
                         age.z*next_silence_length.log.z + 
                         (1+duration.z+pauseafter+pausebefore+next_silence_length.log.z|idstring), 
                    family='binomial', data=pnc, control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## UM ~ sex * age.z + recyear.z + duration.z + pauseafter + pausebefore +  
##     age.z * next_silence_length.log.z + (1 + duration.z + pauseafter +  
##     pausebefore + next_silence_length.log.z | idstring)
##    Data: pnc
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  14087.0  14290.7  -7018.5  14037.0    25489 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -18.5707  -0.2646  -0.1044   0.0143  27.8927 
## 
## Random effects:
##  Groups   Name                      Variance Std.Dev. Corr             
##  idstring (Intercept)               2.3123   1.5206                    
##           duration.z                0.5647   0.7514    0.53            
##           pauseafter                0.2338   0.4835    0.13  0.16      
##           pausebefore               0.8973   0.9473   -0.48 -0.15 -0.13
##           next_silence_length.log.z 0.2518   0.5018   -0.12 -0.12 -0.17
##       
##       
##       
##       
##       
##   0.03
## Number of obs: 25514, groups:  idstring, 395
## 
## Fixed effects:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.76487    0.13072 -13.501  < 2e-16 ***
## sexm                            -1.34238    0.17812  -7.536 4.84e-14 ***
## age.z                           -1.14816    0.10460 -10.976  < 2e-16 ***
## recyear.z                        0.56672    0.08014   7.072 1.53e-12 ***
## duration.z                       1.27442    0.05745  22.184  < 2e-16 ***
## pauseafter                       0.61949    0.11095   5.583 2.36e-08 ***
## pausebefore                     -0.08419    0.08720  -0.965 0.334304    
## next_silence_length.log.z        0.55114    0.06577   8.380  < 2e-16 ***
## sexm:age.z                      -0.49232    0.17459  -2.820 0.004804 ** 
## age.z:next_silence_length.log.z  0.18753    0.04949   3.789 0.000151 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) sexm   age.z  rcyr.z drtn.z pasftr pasbfr nx__.. sxm:g.
## sexm        -0.475                                                        
## age.z       -0.003  0.042                                                 
## recyear.z   -0.003 -0.065 -0.128                                          
## duration.z   0.267 -0.116 -0.088  0.044                                   
## pauseafter  -0.404 -0.097 -0.008  0.008  0.028                            
## pausebefore -0.384  0.045  0.106 -0.013 -0.192  0.015                     
## nxt_slnc_..  0.186  0.018 -0.037  0.011 -0.049 -0.663  0.029              
## sexm:age.z  -0.028  0.136 -0.533 -0.026 -0.031 -0.005  0.019 -0.025       
## ag.z:nx__.. -0.061 -0.014 -0.084 -0.011  0.006 -0.003  0.023  0.168 -0.044

m <- glmer(UM ~ sex * age.z + recyear.z + duration.z + pauseafter + age.z*next_silence_length.log.z + (1+duration.z+pauseafter+next_silence_length.log.z|idstring), family=‘binomial’, data=pnc, control=glmerControl(optimizer=“bobyqa”)) summary(m)

United Kingdom English: British National Corpus

download.file('http://www.let.rug.nl/wieling/uhum/English/BNC/bnc.rda','bnc.rda')
load('bnc.rda')

dim(bnc)
## [1] 25498    13
str(bnc)
## 'data.frame':    25498 obs. of  13 variables:
##  $ speaker     : Factor w/ 960 levels "PS002","PS003",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ age         : int  60 60 60 60 60 60 60 60 60 60 ...
##  $ gender      : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 1 1 ...
##  $ word        : Factor w/ 2 levels "UH","UM": 1 1 1 1 2 1 2 2 2 1 ...
##  $ duration    : num  0.1 0.16 0.09 0.03 0.06 0.07 0.09 0.06 0.06 0.03 ...
##  $ pauseafter  : num  0.09 0.22 2.85 0.26 0.32 0.49 0.57 0.22 0.07 0.54 ...
##  $ UM          : num  0 0 0 0 1 0 1 1 1 0 ...
##  $ nWords      : int  3295 3295 3295 3295 3295 3295 3295 3295 3295 3295 ...
##  $ nUM         : num  14 14 14 14 14 14 14 14 14 14 ...
##  $ nUH         : num  14 14 14 14 14 14 14 14 14 14 ...
##  $ age.z       : num  1.1 1.1 1.1 1.1 1.1 ...
##  $ duration.z  : num  -0.3961 0.0424 -0.4692 -0.9078 -0.6885 ...
##  $ pauseafter.z: num  -0.599 -0.482 1.88 -0.446 -0.392 ...
colnames(bnc)
##  [1] "speaker"      "age"          "gender"       "word"        
##  [5] "duration"     "pauseafter"   "UM"           "nWords"      
##  [9] "nUM"          "nUH"          "age.z"        "duration.z"  
## [13] "pauseafter.z"
  • speaker – ID of the speaker
  • age – Age of the speaker (in years)
  • gender – Gender of the speaker (F: Female, M: Male)
  • word – UH or UM
  • duration – Duration of the hesitation marker (in seconds)
  • pauseafter – The amount of silence following the hesitation marker (in sec.)
  • UM – A binary variable indicating if the word is equal to UM (1) or UH (0)
  • nWords – Total number of words per speaker
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • .z – Z-transformation of the numerical predictors
bnc$birthyear = 1993 - bnc$age
b1 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC")
b2 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC","umwords")
b3 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC","uhwords")
b4 = visualizeUHUM(bnc,"speaker","birthyear","gender","BNC","uhmwords")
multiplot(b1,b2,b4,b3,cols=2)

Final model

# takes about 80 seconds
m <- glmer(UM ~ age.z + gender + duration.z + pauseafter.z + (0+pauseafter.z|speaker) + 
                (1+duration.z+pauseafter.z|speaker),
           data=bnc,family='binomial', control=glmerControl(optimizer="bobyqa"))
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## UM ~ age.z + gender + duration.z + pauseafter.z + (0 + pauseafter.z |  
##     speaker) + (1 + duration.z + pauseafter.z | speaker)
##    Data: bnc
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  27372.2  27469.9 -13674.1  27348.2    25486 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -41.989  -0.656  -0.134   0.659   6.485 
## 
## Random effects:
##  Groups    Name         Variance Std.Dev. Corr     
##  speaker   pauseafter.z 0.06095  0.2469            
##  speaker.1 (Intercept)  2.21248  1.4874            
##            duration.z   0.56188  0.7496   0.75     
##            pauseafter.z 0.11419  0.3379   0.40 0.16
## Number of obs: 25498, groups:  speaker, 960
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.44490    0.08056   5.523 3.34e-08 ***
## age.z        -0.45377    0.04788  -9.477  < 2e-16 ***
## genderm      -0.44894    0.09658  -4.648 3.35e-06 ***
## duration.z    1.06549    0.04716  22.594  < 2e-16 ***
## pauseafter.z  0.44329    0.03527  12.567  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) age.z  gendrm drtn.z
## age.z       -0.027                     
## genderm     -0.650  0.035              
## duration.z   0.428  0.065  0.037       
## pauseaftr.z  0.161  0.017 -0.010  0.019
# model performance
somers2(fitted(m), bnc$UM)
##            C          Dxy            n      Missing 
## 8.532137e-01 7.064274e-01 2.549800e+04 0.000000e+00

Interpretation

Older people and men are relatively less likely to use UM than younger people and women. When the hesitation marker’s duration is longer, and when it is followed by longer pause it is more likely that the hesitation marker is equal to UM.

Additional models

bnc2 = unique(bnc[,c("speaker","age","age.z","gender","nWords","nUM","nUH")])

dim(bnc2)
## [1] 960   7
# average proportion UM as opposed to UH per speaker
mean(bnc2$nUM / (bnc2$nUM + bnc2$nUH), na.rm=T)
## [1] 0.4611727
# percentage UM of all words
mean(bnc2$nUM / bnc2$nWords)
## [1] 0.004269795
# percentage UH of all words
mean(bnc2$nUH / bnc2$nWords)
## [1] 0.004511923
# Older people are relatively less likely to use UM
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ gender + age.z + (1|speaker), data=bnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ gender + age.z + (1 | speaker)
##    Data: bnc2
## 
##      AIC      BIC   logLik deviance df.resid 
##   5774.9   5794.4  -2883.4   5766.9      956 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.42764 -0.40072 -0.00744  0.18734  2.30172 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 1.112    1.055   
## Number of obs: 960, groups:  speaker, 960
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.11742    0.06023 -101.57  < 2e-16 ***
## genderm      0.09470    0.07998    1.18  0.23641    
## age.z       -0.12043    0.03978   -3.03  0.00247 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) gendrm
## genderm -0.734       
## age.z   -0.004  0.000
# Men and older people are relatively more likely to use UH
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ gender + age.z + (1|speaker), data=bnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ gender + age.z + (1 | speaker)
##    Data: bnc2
## 
##      AIC      BIC   logLik deviance df.resid 
##   5703.5   5722.9  -2847.7   5695.5      956 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7733 -0.3212  0.0101  0.2556  3.8537 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.6468   0.8042  
## Number of obs: 960, groups:  speaker, 960
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.20236    0.04888 -126.89   <2e-16 ***
## genderm      0.54884    0.06356    8.64   <2e-16 ***
## age.z        0.27783    0.03134    8.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) gendrm
## genderm -0.758       
## age.z   -0.073  0.006
# Older people and men are relatively more likely to use either UH or UM
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ gender + age.z + (1|speaker), data=bnc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ gender + age.z + (1 |  
##     speaker)
##    Data: bnc2
## 
##      AIC      BIC   logLik deviance df.resid 
##   6828.5   6848.0  -3410.3   6820.5      956 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4357 -0.2617 -0.0042  0.2063  3.4355 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.5352   0.7316  
## Number of obs: 960, groups:  speaker, 960
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.30618    0.04092 -129.67  < 2e-16 ***
## genderm      0.35603    0.05412    6.58 4.75e-11 ***
## age.z        0.11544    0.02670    4.32 1.54e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr) gendrm
## genderm -0.753       
## age.z   -0.028 -0.005

Scottish English: HCRC Map Task Corpus

download.file('http://www.let.rug.nl/wieling/uhum/English/HCRC/hcrc.rda','hcrc.rda')
load('hcrc.rda')

dim(hcrc)
## [1] 1987   18
str(hcrc)
## 'data.frame':    1987 obs. of  18 variables:
##  $ SpeakerID       : Factor w/ 64 levels "q1eta1","q1eta2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age             : int  20 20 20 20 20 20 20 20 20 20 ...
##  $ Sex             : Factor w/ 2 levels "f","m": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ConversationID  : Factor w/ 128 levels "q1ec1","q1ec2",..: 1 1 1 1 6 7 3 7 3 6 ...
##  $ Role            : Factor w/ 2 levels "f","g": 2 2 2 2 1 2 1 2 1 1 ...
##  $ Origin          : Factor w/ 22 levels "Aberdeen","Aldershot",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ Word            : Factor w/ 7 levels "eh","ehm","er",..: 4 4 4 4 5 3 5 1 6 2 ...
##  $ Position        : int  8 16 8 9 1 1 1 3 8 1 ...
##  $ nWordsInPhrase  : int  29 29 18 11 2 17 8 3 13 5 ...
##  $ FirstInPhrase   : num  0 0 0 0 1 1 1 0 0 1 ...
##  $ LastInPhrase    : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ UM              : num  1 1 1 1 1 0 1 0 0 1 ...
##  $ nWords          : int  2102 2102 2102 2102 2102 2102 2102 2102 2102 2102 ...
##  $ nUM             : num  7 7 7 7 7 7 7 7 7 7 ...
##  $ nUH             : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ Age.z           : num  0.0975 0.0975 0.0975 0.0975 0.0975 ...
##  $ Position.z      : num  0.602 1.904 0.602 0.765 -0.538 ...
##  $ nWordsInPhrase.z: num  1.6615 1.6615 0.6237 -0.0367 -0.8858 ...
colnames(hcrc)
##  [1] "SpeakerID"        "Age"              "Sex"             
##  [4] "ConversationID"   "Role"             "Origin"          
##  [7] "Word"             "Position"         "nWordsInPhrase"  
## [10] "FirstInPhrase"    "LastInPhrase"     "UM"              
## [13] "nWords"           "nUM"              "nUH"             
## [16] "Age.z"            "Position.z"       "nWordsInPhrase.z"
  • SpeakerID – ID of the speaker
  • Age – Age of the speaker (in years)
  • Sex – Gender of the speaker (f: female, m: male)
  • ConversationID – ID of the conversation
  • Role – Role of the speaker (g: guide, f: follower)
  • Origin – Place of origin of the speaker
  • Word – Word: eh, er, uh (all classified as UH), and ehm, erm, mm, um (classiefied as UM)
  • Position – Position of the word in the phrase
  • nWordsInPhrase – Number of words in the phrase
  • FirstInPhrase – The first word in the phrase is marked with 1, otherwise 0
  • LastInPhrase – The last word in the phrase is marked with 1, otherwise 0
  • UM – A binary variable indicating if the word is equal to UM (1: ehm, erm, mm, um) or UH (0: eh, er, uh)
  • nWords – Total number of words per speaker
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • .z – Z-transformation of the numerical predictors
hcrc$BirthYear = 1990 - hcrc$Age
h1 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC")
h2 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC","umwords")
h3 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC","uhwords")
h4 = visualizeUHUM(hcrc,"SpeakerID","BirthYear","Sex","HCRC","uhmwords")
multiplot(h1,h2,h4,h3,cols=2)

Final model

# takes about 1 second
m <- glmer(UM ~ Age.z + Sex + FirstInPhrase + LastInPhrase + (1+FirstInPhrase|SpeakerID) + (1|ConversationID),
           data=hcrc, family='binomial', control=glmerControl(optimizer="bobyqa"))
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## UM ~ Age.z + Sex + FirstInPhrase + LastInPhrase + (1 + FirstInPhrase |  
##     SpeakerID) + (1 | ConversationID)
##    Data: hcrc
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   2008.2   2058.6   -995.1   1990.2     1978 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2692 -0.5310  0.2290  0.5218  4.1452 
## 
## Random effects:
##  Groups         Name          Variance Std.Dev. Corr 
##  ConversationID (Intercept)   0.4517   0.6721        
##  SpeakerID      (Intercept)   1.7322   1.3161        
##                 FirstInPhrase 0.8588   0.9267   -0.28
## Number of obs: 1987, groups:  ConversationID, 128; SpeakerID, 64
## 
## Fixed effects:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.8774     0.2867   3.061  0.00221 ** 
## Age.z          -0.3500     0.1924  -1.819  0.06886 .  
## Sexm           -2.2966     0.3935  -5.836 5.35e-09 ***
## FirstInPhrase   0.8340     0.1901   4.387 1.15e-05 ***
## LastInPhrase    1.0688     0.1505   7.101 1.24e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Age.z  Sexm   FrstIP
## Age.z       -0.016                     
## Sexm        -0.673  0.027              
## FirstInPhrs -0.258  0.021 -0.072       
## LastInPhras -0.134 -0.004 -0.024  0.066
# model performance
somers2(fitted(m), hcrc$UM)
##            C          Dxy            n      Missing 
##    0.8881135    0.7762270 1987.0000000    0.0000000
range(hcrc$Age)
## [1] 17 30

Interpretation

Men are relatively less likely to use UM than women. When the hesitation marker is located at the beginning or the end of a phrase, it is more likely that the hesitation marker is equal to UM. While the direction of the age effect is as predicted, it is not significant. The likely reason for this is the relatively small age range in this data set.

Additional models

hcrc2 = unique(hcrc[,c("SpeakerID","Age","Age.z","Sex","nWords","nUM","nUH")])

dim(hcrc2)
## [1] 64  7
# average proportion UM as opposed to UH per speaker
mean(hcrc2$nUM / (hcrc2$nUM + hcrc2$nUH), na.rm=T)
## [1] 0.5716515
# percentage UM of all words
mean(hcrc2$nUM / hcrc2$nWords)
## [1] 0.008127415
# percentage UH of all words
mean(hcrc2$nUH / hcrc2$nWords)
## [1] 0.005806336
# Older people and men are relatively less likely to use UM
# (note, however, the limited age range in this dataset)
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1|SpeakerID), data=hcrc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Sex + Age.z + (1 | SpeakerID)
##    Data: hcrc2
## 
##      AIC      BIC   logLik deviance df.resid 
##    483.4    492.0   -237.7    475.4       60 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.64297 -0.32997  0.03752  0.18109  0.57951 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SpeakerID (Intercept) 0.6941   0.8331  
## Number of obs: 64, groups:  SpeakerID, 64
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.741359   0.002803 -1691.7   <2e-16 ***
## Sexm        -1.027418   0.002803  -366.6   <2e-16 ***
## Age.z       -0.358405   0.002802  -127.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) Sexm 
## Sexm  0.001       
## Age.z 0.001  0.001
## convergence code: 0
## Model failed to converge with max|grad| = 0.0252223 (tol = 0.001, component 1)
# Men are more likely to use UH than women, but older men are less likely to use UH, 
# whereas older women are relatively more likely to use UH
# (note, however, the limited age range in this dataset)
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1|SpeakerID), data=hcrc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Sex + Sex:Age.z + (1 | SpeakerID)
##    Data: hcrc2
## 
##      AIC      BIC   logLik deviance df.resid 
##    446.2    457.0   -218.1    436.2       59 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.21354 -0.46214  0.03552  0.23155  0.57126 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SpeakerID (Intercept) 0.7567   0.8699  
## Number of obs: 64, groups:  SpeakerID, 64
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.124447   0.002195 -2790.0   <2e-16 ***
## Sexm         0.984824   0.002193   449.1   <2e-16 ***
## Sexf:Age.z   0.144781   0.002193    66.0   <2e-16 ***
## Sexm:Age.z  -0.125410   0.002193   -57.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) Sexm  Sxf:A.
## Sexm       0.000              
## Sexf:Age.z 0.000  0.000       
## Sexm:Age.z 0.000  0.000 0.000 
## convergence code: 0
## Model failed to converge with max|grad| = 0.0322403 (tol = 0.001, component 1)
# Older people are less likely to use either UH or UM
# (note, however, the limited age range in this dataset)
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1|SpeakerID), data=hcrc2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH + nUM, nWords - nUH - nUM) ~ Sex + Age.z + (1 | SpeakerID)
##    Data: hcrc2
## 
##      AIC      BIC   logLik deviance df.resid 
##    539.7    548.4   -265.9    531.7       60 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.01320 -0.28313 -0.02075  0.20122  0.46859 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SpeakerID (Intercept) 0.3825   0.6185  
## Number of obs: 64, groups:  SpeakerID, 64
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.4138     0.1160  -38.05   <2e-16 ***
## Sexm         -0.1133     0.1643   -0.69   0.4905    
## Age.z        -0.1965     0.0841   -2.34   0.0195 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) Sexm  
## Sexm  -0.704       
## Age.z -0.007  0.028

German: Forschungs- und Lehrkorpus Gesprochenes Deutsch

download.file('http://www.let.rug.nl/wieling/uhum/German/ger.rda','ger.rda')
load('ger.rda')

dim(ger)
## [1] 16221     9
str(ger)
## 'data.frame':    16221 obs. of  9 variables:
##  $ Speaker    : Factor w/ 238 levels "FOLK_S_00027",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BirthYear  : int  1967 1967 1967 1967 1967 1967 1967 1967 1967 1967 ...
##  $ Gender     : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Interview  : Factor w/ 234 levels "FOLK_E_00002_SE_01_T_01_DF_01",..: 3 3 3 3 3 3 4 4 4 4 ...
##  $ Variant    : Factor w/ 4 levels "äh","ähm","öh",..: 1 1 2 1 1 1 1 1 1 1 ...
##  $ UM         : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ nUM        : num  9 9 9 9 9 9 9 9 9 9 ...
##  $ nUH        : num  71 71 71 71 71 71 71 71 71 71 ...
##  $ BirthYear.z: num  -0.868 -0.868 -0.868 -0.868 -0.868 ...
colnames(ger)
## [1] "Speaker"     "BirthYear"   "Gender"      "Interview"   "Variant"    
## [6] "UM"          "nUM"         "nUH"         "BirthYear.z"
  • Speaker – ID of the speaker
  • BirthYear – Year of birth of the speaker (in years)
  • Gender – Gender of the speaker (F: Female, M: Male)
  • Interview – ID of the interview
  • Variant – Word: äh, ähm, öh, or öhm
  • UM – A binary variable indicating if the word is equal to UM (1: ähm and öhm) or UH (0: äh and öh)
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • BirthYear.z – Z-transformation of BirthYear
visualizeUHUM(ger,"Speaker","BirthYear","Gender","German")

Final model

# takes about 40 seconds
m <- glmer(UM ~ BirthYear.z + Gender + (1|Speaker) + (0+BirthYear.z|Interview) +
                (1+Gender|Interview), family='binomial', data=ger)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: UM ~ BirthYear.z + Gender + (1 | Speaker) + (0 + BirthYear.z |  
##     Interview) + (1 + Gender | Interview)
##    Data: ger
## 
##      AIC      BIC   logLik deviance df.resid 
##  17299.0  17360.6  -8641.5  17283.0    16213 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6928 -0.6493 -0.2731  0.6959  6.1829 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev. Corr 
##  Speaker     (Intercept) 0.92586  0.9622        
##  Interview   BirthYear.z 0.02069  0.1438        
##  Interview.1 (Intercept) 0.07078  0.2660        
##              GenderM     0.41934  0.6476   -0.80
## Number of obs: 16221, groups:  Speaker, 238; Interview, 234
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.12511    0.10850   1.153  0.24889    
## BirthYear.z  0.94290    0.08197  11.503  < 2e-16 ***
## GenderM     -0.43433    0.16102  -2.697  0.00699 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) BrthY.
## BirthYear.z -0.047       
## GenderM     -0.688  0.083
# model performance
somers2(fitted(m), ger$UM)
##            C          Dxy            n      Missing 
## 8.163982e-01 6.327963e-01 1.622100e+04 0.000000e+00

Interpretation

Older people (i.e. with a lower year of birth) and men are relatively less likely to use UM than younger people and women.

Additional information

ger2 = unique(ger[,c("Speaker","BirthYear","Gender","BirthYear.z","nUM","nUH")])

dim(ger2)
## [1] 238   6
# average proportion UM as opposed to UH per speaker
mean(ger2$nUM / (ger2$nUM + ger2$nUH), na.rm=T)
## [1] 0.501731
# Note that the number of words per speaker was not available in this dataset, 
# so additional models (assessing the relative frequency of hesitation markers
# with respect to all other words) are not possible. 

Norwegian: Nordic Dialect Corpus and Syntax Database

download.file('http://www.let.rug.nl/wieling/uhum/Norwegian/nor.rda','nor.rda')
load('nor.rda')

dim(nor)
## [1] 47604    13
table(nor$AgeGroup,nor$RecordingYear) # unequal distribution
##        
##         1951 1956 1958 1959 1960 1962 1963 1964 1965 1967 1968 1969 1970
##   Old    138  263  159  162  128   29  119    8   88  452 1312  987  220
##   Young    0    0    0    0    0    0    0    0    0    0   62    0    0
##        
##         1971 1972 1973 1974 1975 1976 1978 1979 1980 1984 2006 2007 2008
##   Old    268  185  799  509  141  191  198 1234   22   21  867 3147 9100
##   Young    0    0    0    0    0    3    0    0    0    0   84 1996 6455
##        
##         2009 2010 2011 2012
##   Old   6099 5372  332   81
##   Young 3634 2154  493   75
str(nor)
## 'data.frame':    47604 obs. of  13 variables:
##  $ Speaker        : Factor w/ 554 levels "aal_01um","aal_02uk",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AgeGroup       : Factor w/ 2 levels "Old","Young": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Gender         : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Region         : Factor w/ 5 levels "Nord-Norge","Østlandet",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Area           : Factor w/ 18 levels "Akershus","Aust-Agder",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Place          : Factor w/ 163 levels "Ål","Alvdal",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ RecordingYear  : int  2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
##  $ Form           : Factor w/ 9 levels "e","E","em","EM",..: 1 1 3 1 3 1 1 1 6 1 ...
##  $ UM             : int  0 0 1 0 1 0 0 0 1 0 ...
##  $ nWords         : int  3824 3824 3824 3824 3824 3824 3824 3824 3824 3824 ...
##  $ nUM            : int  36 36 36 36 36 36 36 36 36 36 ...
##  $ nUH            : num  101 101 101 101 101 101 101 101 101 101 ...
##  $ RecordingYear.z: num  0.484 0.484 0.484 0.484 0.484 ...
colnames(nor)
##  [1] "Speaker"         "AgeGroup"        "Gender"         
##  [4] "Region"          "Area"            "Place"          
##  [7] "RecordingYear"   "Form"            "UM"             
## [10] "nWords"          "nUM"             "nUH"            
## [13] "RecordingYear.z"
  • Speaker – ID of the speaker
  • AgeGroup – Age group of the speaker (Old: 50+, Young: 18-30)
  • Gender – Gender of the speaker (Female, Male)
  • Region – Region from which the speaker originates in Norway
  • Area – Area within region from which the speaker originates
  • Place – Place from which the speaker originates
  • RecordingYear – Year of recording
  • Form – Form of hesitation marker: ‘e’, ‘E’, ‘em’, ‘EM’, ‘h-e’, ‘m’, ‘M’, ‘m-m’,‘m_m’
  • UM – A binary variable indicating if the word is equal to UM (1: ‘em’, ‘EM’, ‘m’, ‘M’, ‘m-m’, ‘m_m’) or UH (‘e’, ‘E’, ‘h-e’)
  • nWords – Total number of words per speaker
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • RecordingYear.z – Z-transformation of RecordingYear
n1 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian")
n2 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian","umwords")
n3 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian","uhwords")
n4 = visualizeUHUM(nor,"Speaker","AgeGroup","Gender","Norwegian","uhmwords")
multiplot(n1,n2,n4,n3,cols=2)

n5 = visualizeUHUM(nor,"Speaker","RecordingYear","Gender","Norwegian")
n5

Final model

# takes about 30 seconds
m <- glmer(UM ~ AgeGroup + Gender + RecordingYear.z + (1|Speaker), family='binomial', data=nor)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: UM ~ AgeGroup + Gender + RecordingYear.z + (1 | Speaker)
##    Data: nor
## 
##      AIC      BIC   logLik deviance df.resid 
##  32281.3  32325.1 -16135.6  32271.3    47582 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0661 -0.3865 -0.2880 -0.1800  8.2064 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  Speaker (Intercept) 0.8291   0.9106  
## Number of obs: 47587, groups:  Speaker, 553
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.42837    0.07829 -31.019  < 2e-16 ***
## AgeGroupYoung    0.64721    0.09546   6.780 1.20e-11 ***
## GenderMale      -0.23132    0.09071  -2.550   0.0108 *  
## RecordingYear.z  0.34890    0.05869   5.945 2.77e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) AgGrpY GndrMl
## AgeGroupYng -0.485              
## GenderMale  -0.646  0.013       
## RecrdngYr.z -0.082 -0.318  0.156
# model performance
somers2(fitted(m), nor[!is.na(nor$RecordingYear),]$UM)
##            C          Dxy            n      Missing 
## 7.582430e-01 5.164861e-01 4.758700e+04 0.000000e+00

Interpretation

Older people and men are relatively less likely to use UM than younger people and women. People recorded in more recent years are more likely to use UM than people recorded earlier.

Additional models

nor2 = unique(nor[,c("Speaker","AgeGroup","Gender","RecordingYear","RecordingYear.z","nWords","nUM","nUH")])

dim(nor2)
## [1] 554   8
# average proportion UM as opposed to UH per speaker
mean(nor2$nUM / (nor2$nUM + nor2$nUH), na.rm=T)
## [1] 0.1285163
# percentage UM of all words
mean(nor2$nUM / nor2$nWords)
## [1] 0.002645844
# percentage UH of all words
mean(nor2$nUH / nor2$nWords)
## [1] 0.01885864
# Younger people are relatively more likely than older people to use UM, older people (but not younger people)
# who were more recently recorded are relatively more likely to use UM
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Gender + AgeGroup + AgeGroup:RecordingYear.z + (1|Speaker), data=nor2,
                     family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUM, nWords - nUM) ~ Gender + AgeGroup + AgeGroup:RecordingYear.z +  
##     (1 | Speaker)
##    Data: nor2
## 
##      AIC      BIC   logLik deviance df.resid 
##   3541.3   3567.2  -1764.7   3529.3      547 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.43065 -0.35547  0.00705  0.18730  1.67140 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  Speaker (Intercept) 0.8852   0.9408  
## Number of obs: 553, groups:  Speaker, 553
## 
## Fixed effects:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -6.55851    0.07986  -82.13  < 2e-16 ***
## GenderMale                     0.03070    0.09214    0.33 0.738992    
## AgeGroupYoung                  0.73395    0.19055    3.85 0.000117 ***
## AgeGroupOld:RecordingYear.z    0.38582    0.06039    6.39 1.67e-10 ***
## AgeGroupYoung:RecordingYear.z -0.49511    0.33593   -1.47 0.140524    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrMl AgGrpY AGO:RY
## GenderMale  -0.641                     
## AgeGroupYng -0.240  0.002              
## AgGrpOl:RY. -0.077  0.151 -0.014       
## AgGrpYn:RY. -0.024  0.033 -0.876  0.007
# Men and older people are relatively more likely to use UH
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Gender + AgeGroup + Gender + RecordingYear.z + (1|Speaker), data=nor2,
                     family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH, nWords - nUH) ~ Gender + AgeGroup + Gender + RecordingYear.z +  
##     (1 | Speaker)
##    Data: nor2
## 
##      AIC      BIC   logLik deviance df.resid 
##   5389.0   5410.5  -2689.5   5379.0      548 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.60536 -0.16780  0.00832  0.12258  0.87358 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  Speaker (Intercept) 0.3365   0.5801  
## Number of obs: 553, groups:  Speaker, 553
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -4.14330    0.04423  -93.67  < 2e-16 ***
## GenderMale       0.27360    0.05232    5.23 1.70e-07 ***
## AgeGroupYoung   -0.33226    0.05677   -5.85 4.82e-09 ***
## RecordingYear.z  0.02952    0.02921    1.01    0.312    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrMl AgGrpY
## GenderMale  -0.648              
## AgeGroupYng -0.493  0.003       
## RecrdngYr.z  0.052  0.157 -0.370
# Men and older people are also relatively more likely to either use UM or UH 
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + AgeGroup + RecordingYear.z + (1|Speaker), 
                       data=nor2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + AgeGroup + RecordingYear.z +  
##     (1 | Speaker)
##    Data: nor2
## 
##      AIC      BIC   logLik deviance df.resid 
##   5516.4   5538.0  -2753.2   5506.4      548 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.72681 -0.15603  0.01666  0.12143  0.91305 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  Speaker (Intercept) 0.3209   0.5665  
## Number of obs: 553, groups:  Speaker, 553
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -4.01489    0.04308  -93.19  < 2e-16 ***
## GenderMale       0.24686    0.05092    4.85 1.25e-06 ***
## AgeGroupYoung   -0.24269    0.05522   -4.40 1.11e-05 ***
## RecordingYear.z  0.05309    0.02850    1.86   0.0625 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) GndrMl AgGrpY
## GenderMale  -0.647              
## AgeGroupYng -0.495  0.005       
## RecrdngYr.z  0.052  0.157 -0.370

Danish/Faroese: Faroese Danish Corpus Hamburg

download.file('http://www.let.rug.nl/wieling/uhum/Danish-Faroese/danfar.rda','danfar.rda')
load('danfar.rda')

dim(danfar)
## [1] 4504   17
str(danfar)
## 'data.frame':    4504 obs. of  17 variables:
##  $ speaker  : Factor w/ 57 levels "Interviewer2",..: 49 49 49 49 49 49 49 49 49 49 ...
##  $ age      : num  26 26 26 26 26 26 26 26 26 26 ...
##  $ sex      : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 1 1 ...
##  $ lang     : Factor w/ 2 levels "dan","far": 1 1 1 1 1 1 1 1 1 1 ...
##  $ word     : Factor w/ 50 levels "Eeeeh","eeeehh",..: 42 42 42 42 42 45 42 42 42 42 ...
##  $ type     : Factor w/ 2 levels "uh","uhm": 1 1 1 1 1 2 1 1 1 1 ...
##  $ UM       : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ nWords   : num  25462 25462 25462 25462 25462 ...
##  $ nWordsDan: num  25462 25462 25462 25462 25462 ...
##  $ nWordsFar: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ nUM      : num  73 73 73 73 73 73 73 73 73 73 ...
##  $ nUMDan   : num  73 73 73 73 73 73 73 73 73 73 ...
##  $ nUMFar   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ nUH      : num  284 284 284 284 284 284 284 284 284 284 ...
##  $ nUHDan   : num  284 284 284 284 284 284 284 284 284 284 ...
##  $ nUHFar   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age.z    : num  -0.794 -0.794 -0.794 -0.794 -0.794 ...
colnames(danfar)
##  [1] "speaker"   "age"       "sex"       "lang"      "word"     
##  [6] "type"      "UM"        "nWords"    "nWordsDan" "nWordsFar"
## [11] "nUM"       "nUMDan"    "nUMFar"    "nUH"       "nUHDan"   
## [16] "nUHFar"    "age.z"
  • speaker – ID of the speaker
  • age – age of the speaker
  • sex – Gender of the speaker (f: female, m: male)
  • lang – language of the speaker (dan: danish, far: faroese)
  • word – hesitation marker (50 variants)
  • type – hesitation marker classification: uh or uhm
  • UM – A binary variable indicating if the word is equal to UM (words including an ‘m’: 1), or UH (words which do not include an ‘m’: 0)
  • nWords – Total number of words per speaker
  • nWordsDan – Total number of words per speaker in Danish
  • nWordsFar – Total number of words per speaker in Faroese
  • nUM – Total number of UMs per speaker
  • nUMDan – Total number of UMs per speaker in Danish
  • nUMFar – Total number of UMs per speaker in Faroese
  • nUH – Total number of UHs per speaker
  • nUHDan – Total number of UHs per speaker in Danish
  • nUHFar – Total number of UHs per speaker in Faroese
  • age.z – Z-transformation of age
# Danish/Faroese together
danfar$birthyear = 2007 - danfar$age # approximation as recording year ranges between 2005 and 2009
df1 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese")
df2 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese","umwords")
df3 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese","uhwords")
df4 = visualizeUHUM(danfar,"speaker","birthyear","sex","Danish/Faroese","uhmwords")
multiplot(df1,df2,df4,df3,cols=2)

# Danish only
dan = droplevels(danfar[danfar$lang == 'dan',])
d1 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish")
d2 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish","umwords",nwordVar='nWordsDan')
d3 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish","uhwords",nwordVar='nWordsDan')
d4 = visualizeUHUM(dan,"speaker","birthyear","sex","Danish","uhmwords",nwordVar='nWordsDan')
multiplot(d1,d2,d4,d3,cols=2)

# Faroese only
far = droplevels(danfar[danfar$lang == 'far',])
f1 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese")
f2 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese","umwords",nwordVar='nWordsFar')
f3 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese","uhwords",nwordVar='nWordsFar')
f4 = visualizeUHUM(far,"speaker","birthyear","sex","Faroese","uhmwords",nwordVar='nWordsFar')
multiplot(f1,f2,f4,f3,cols=2)

Final model

# takes about 30 seconds
m <- glmer(UM ~ sex + age.z:lang + (1+lang|speaker), family='binomial', data=danfar)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: UM ~ sex + age.z:lang + (1 + lang | speaker)
##    Data: danfar
## 
##      AIC      BIC   logLik deviance df.resid 
##   3940.2   3985.0  -1963.1   3926.2     4497 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1698 -0.4785 -0.3196 -0.1591  5.8215 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr
##  speaker (Intercept) 0.2412   0.4912       
##          langfar     4.5529   2.1337   0.27
## Number of obs: 4504, groups:  speaker, 57
## 
## Fixed effects:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.2888     0.1494  -8.624  < 2e-16 ***
## sexm           -0.5876     0.2008  -2.927  0.00342 ** 
## age.z:langdan  -0.4118     0.1010  -4.077 4.56e-05 ***
## age.z:langfar  -0.5937     0.4276  -1.388  0.16504    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) sexm   ag.z:lngd
## sexm        -0.335                 
## age.z:lngdn  0.202  0.125          
## age.z:lngfr -0.043 -0.086  0.169
# model performance
somers2(fitted(m), danfar$UM)
##            C          Dxy            n      Missing 
##    0.7750354    0.5500709 4504.0000000    0.0000000

Interpretation

Older people and men are relatively less likely to use UM than younger people and women. The age effect is greater in the Faroese language than in the Danish language.

Additional models

Danish and Faroese

danfar2 = unique(danfar[,c("speaker","age","age.z","sex","nWords","nUM","nUH")])
dim(danfar2) # note the small number of speakers
## [1] 57  7
# average proportion UM as opposed to UH per speaker
mean(danfar2$nUM / (danfar2$nUM + danfar2$nUH), na.rm=T)
## [1] 0.1652995
# percentage UM of all words
mean(danfar2$nUM / danfar2$nWords)
## [1] 0.001991513
# percentage UH of all words
mean(danfar2$nUH / danfar2$nWords)
## [1] 0.007876947
# Older people are less likely to use UM 
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ age.z + sex + (1|speaker), data=danfar2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ age.z + sex + (1 | speaker)
##    Data: danfar2
## 
##      AIC      BIC   logLik deviance df.resid 
##    403.1    411.3   -197.6    395.1       53 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.15879 -0.30769  0.04376  0.15900  0.52398 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 1.337    1.156   
## Number of obs: 57, groups:  speaker, 57
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -6.7956     0.2297 -29.587   <2e-16 ***
## age.z        -0.4186     0.1714  -2.442   0.0146 *  
## sexm         -0.3271     0.3376  -0.969   0.3325    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z  0.082       
## sexm  -0.663 -0.085
# Use of UH is not influenced by age or gender 
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ age.z + sex + (1|speaker), data=danfar2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ age.z + sex + (1 | speaker)
##    Data: danfar2
## 
##      AIC      BIC   logLik deviance df.resid 
##    559.8    568.0   -275.9    551.8       53 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -0.92994 -0.13737  0.00454  0.09219  0.19001 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.6666   0.8164  
## Number of obs: 57, groups:  speaker, 57
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.16269    0.15367  -33.60   <2e-16 ***
## age.z       -0.09431    0.11436   -0.82    0.410    
## sexm         0.04130    0.22512    0.18    0.854    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z  0.061       
## sexm  -0.683 -0.084
# Use of either UM or UH is not influenced by age or gender
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ age.z + sex + (1|speaker), data=danfar2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH + nUM, nWords - nUH - nUM) ~ age.z + sex + (1 | speaker)
##    Data: danfar2
## 
##      AIC      BIC   logLik deviance df.resid 
##    586.7    594.9   -289.4    578.7       53 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -0.94196 -0.09405 -0.00001  0.08330  0.15900 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.7473   0.8645  
## Number of obs: 57, groups:  speaker, 57
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.92061    0.16147 -30.474   <2e-16 ***
## age.z       -0.15661    0.11990  -1.306    0.191    
## sexm        -0.06841    0.23672  -0.289    0.773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z  0.066       
## sexm  -0.684 -0.091

Danish only

dan2 = unique(dan[,c("speaker","age","age.z","sex","nWordsDan","nUMDan","nUHDan")])
dim(dan2) # note the small number of speakers
## [1] 43  7
# average proportion UM as opposed to UH per speaker
mean(dan2$nUM / (dan2$nUM + dan2$nUH), na.rm=T)
## [1] 0.1952701
# percentage UM of all words
mean(dan2$nUM / dan2$nWords)
## [1] 0.003941396
# percentage UH of all words
mean(dan2$nUH / dan2$nWords)
## [1] 0.01605026
# Older people are less likely to use UM 
summary(mUM <- glmer(cbind(nUMDan, nWordsDan - nUMDan) ~ age.z + sex + (1|speaker), data=dan2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUMDan, nWordsDan - nUMDan) ~ age.z + sex + (1 | speaker)
##    Data: dan2
## 
##      AIC      BIC   logLik deviance df.resid 
##    300.8    307.8   -146.4    292.8       39 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0772 -0.3598  0.0117  0.2002  0.6453 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.4025   0.6344  
## Number of obs: 43, groups:  speaker, 43
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.7334     0.1475  -38.87  < 2e-16 ***
## age.z        -0.4804     0.1128   -4.26 2.04e-05 ***
## sexm         -0.2317     0.2210   -1.05    0.295    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z  0.029       
## sexm  -0.659  0.009
# Men are more likely to use UH 
summary(mUH <- glmer(cbind(nUHDan, nWordsDan - nUHDan) ~ age.z + sex + (1|speaker), data=dan2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUHDan, nWordsDan - nUHDan) ~ age.z + sex + (1 | speaker)
##    Data: dan2
## 
##      AIC      BIC   logLik deviance df.resid 
##    416.0    423.1   -204.0    408.0       39 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -0.72174 -0.20364 -0.01880  0.06879  0.39320 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.3193   0.5651  
## Number of obs: 43, groups:  speaker, 43
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.45102    0.12112  -36.75   <2e-16 ***
## age.z       -0.08576    0.09132   -0.94   0.3477    
## sexm         0.35532    0.18037    1.97   0.0488 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z -0.045       
## sexm  -0.670  0.012
# Older people tend to be less likely to use either UM or UH (marginally significant)
summary(mUHUM <- glmer(cbind(nUHDan + nUMDan, nWordsDan - nUHDan - nUMDan) ~ age.z + sex + (1|speaker), data=dan2, 
                       family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUHDan + nUMDan, nWordsDan - nUHDan - nUMDan) ~ age.z +  
##     sex + (1 | speaker)
##    Data: dan2
## 
##      AIC      BIC   logLik deviance df.resid 
##    434.3    441.3   -213.1    426.3       39 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -0.71377 -0.16113 -0.02137  0.07914  0.35917 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.3161   0.5622  
## Number of obs: 43, groups:  speaker, 43
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.15765    0.11929  -34.85   <2e-16 ***
## age.z       -0.17558    0.09021   -1.95   0.0516 .  
## sexm         0.22770    0.17829    1.28   0.2015    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z -0.041       
## sexm  -0.668  0.010

Faroese only

far2 = unique(far[,c("speaker","age","age.z","sex","nWordsFar","nUMFar","nUHFar")])
dim(far2) # note the small number of speakers)
## [1] 48  7
# average proportion UM as opposed to UH per speaker
mean(far2$nUM / (far2$nUM + far2$nUH), na.rm=T)
## [1] 0.09746815
# percentage UM of all words
mean(far2$nUM / far2$nWords)
## [1] 0.000429629
# percentage UH of all words
mean(far2$nUH / far2$nWords)
## [1] 0.002967625
# Older people are less likely to use UM 
summary(mUM <- glmer(cbind(nUMFar, nWordsFar - nUMFar) ~ age.z + sex + (1|speaker), data=far2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUMFar, nWordsFar - nUMFar) ~ age.z + sex + (1 | speaker)
##    Data: far2
## 
##      AIC      BIC   logLik deviance df.resid 
##    141.5    149.0    -66.8    133.5       44 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.5537 -0.4422 -0.1977  0.2227  1.5478 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 3.334    1.826   
## Number of obs: 48, groups:  speaker, 48
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -9.4549     0.6339 -14.915  < 2e-16 ***
## age.z        -1.1824     0.4452  -2.656  0.00791 ** 
## sexm         -0.4776     0.7286  -0.656  0.51210    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z  0.322       
## sexm  -0.567 -0.080
# Use of UH is not influenced by age or gender 
summary(mUH <- glmer(cbind(nUHFar, nWordsFar - nUHFar) ~ age.z + sex + (1|speaker), data=far2, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUHFar, nWordsFar - nUHFar) ~ age.z + sex + (1 | speaker)
##    Data: far2
## 
##      AIC      BIC   logLik deviance df.resid 
##    332.8    340.3   -162.4    324.8       44 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.12895 -0.26198 -0.03189  0.17136  0.64055 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.6112   0.7818  
## Number of obs: 48, groups:  speaker, 48
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.15937    0.18492  -33.31   <2e-16 ***
## age.z       -0.20662    0.13916   -1.48    0.138    
## sexm         0.03106    0.25274    0.12    0.902    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z  0.085       
## sexm  -0.728 -0.098
# Older people are relatively less likely to use either UM or UH 
summary(mUHUM <- glmer(cbind(nUHFar + nUMFar, nWordsFar - nUHFar - nUMFar) ~ age.z + sex + (1|speaker), data=far2, 
                       family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUHFar + nUMFar, nWordsFar - nUHFar - nUMFar) ~ age.z +  
##     sex + (1 | speaker)
##    Data: far2
## 
##      AIC      BIC   logLik deviance df.resid 
##    344.3    351.8   -168.2    336.3       44 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.17897 -0.19065 -0.01544  0.19102  0.66997 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  speaker (Intercept) 0.5977   0.7731  
## Number of obs: 48, groups:  speaker, 48
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.97027    0.18000  -33.17   <2e-16 ***
## age.z       -0.28502    0.13573   -2.10   0.0357 *  
## sexm        -0.07555    0.24781   -0.30   0.7605    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) age.z 
## age.z  0.105       
## sexm  -0.726 -0.112

Dutch: Corpus Gesproken Nederlands

download.file('http://www.let.rug.nl/wieling/uhum/Dutch/nl.rda','nl.rda')
load('nl.rda')

dim(nl)
## [1] 228619     44
str(nl)
## 'data.frame':    228619 obs. of  44 variables:
##  $ wordid         : Factor w/ 228619 levels "fn000001.12.7",..: 552 579 558 550 540 587 565 570 532 551 ...
##  $ interviewid    : Factor w/ 5181 levels "fn000001","fn000002",..: 15 19 15 15 14 19 15 16 12 15 ...
##  $ speakerid      : Factor w/ 3433 levels "N00003","N00005",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ word           : Factor w/ 2 levels "uh","uhm": 1 2 1 1 1 1 1 1 1 1 ...
##  $ prevword       : Factor w/ 15014 levels "'k","'m","'n",..: 12639 4766 8683 12246 5782 12480 9029 7 14823 2695 ...
##  $ prevPOS        : Factor w/ 11 levels "ADJ","BW","LID",..: 6 11 10 7 10 11 10 3 11 2 ...
##  $ nextword       : Factor w/ 22438 levels "'k","'m","'ns",..: 4006 5296 10294 5296 8552 4609 7 12769 11134 19103 ...
##  $ nextPOS        : Factor w/ 11 levels "ADJ","BW","LID",..: 1 8 4 8 10 4 3 4 8 6 ...
##  $ component      : Factor w/ 15 levels "comp-a","comp-b",..: 12 12 12 12 12 12 12 9 9 12 ...
##  $ phrasenumber   : int  27 2 30 22 9 4 6 4 8 27 ...
##  $ interviewlength: int  39 30 39 39 9 30 39 5 11 39 ...
##  $ relsentpos     : num  0.6923 0.0667 0.7692 0.5641 1 ...
##  $ position       : int  12 1 3 13 11 3 12 4 1 11 ...
##  $ phraselength   : int  19 9 13 31 21 22 23 12 9 19 ...
##  $ relpos         : num  0.632 0.111 0.231 0.419 0.524 ...
##  $ isfirst        : num  0 1 0 0 0 0 0 0 1 0 ...
##  $ islast         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ duration       : num  0.088 0.475 0.091 0.051 0.071 ...
##  $ pausebefore    : num  0.254 0.076 0 0.106 0 ...
##  $ pauseafter     : num  0.182 0.222 0.334 0.212 0 ...
##  $ recyear        : int  1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
##  $ preparedness   : int  3 3 3 3 3 3 3 1 1 3 ...
##  $ sex            : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ birthyear      : int  1977 1977 1977 1977 1977 1977 1977 1977 1977 1977 ...
##  $ age            : int  22 22 22 22 22 22 22 22 22 22 ...
##  $ edulevel       : num  9 9 9 9 9 9 9 9 9 9 ...
##  $ wps            : num  3.49 3.49 3.49 3.49 3.49 ...
##  $ country        : Factor w/ 2 levels "N","V": 1 1 1 1 1 1 1 1 1 1 ...
##  $ UM             : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ nWords         : int  2235 2235 2235 2235 2235 2235 2235 2235 2235 2235 ...
##  $ nUM            : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ nUH            : num  65 65 65 65 65 65 65 65 65 65 ...
##  $ relsentpos.z   : num  0.632 -1.487 0.892 0.198 1.674 ...
##  $ relpos.z       : num  0.36973 -1.37034 -0.97029 -0.3398 0.00942 ...
##  $ duration.z     : num  -0.72 1.562 -0.702 -0.938 -0.82 ...
##  $ pausebefore.z  : num  0.148 -0.244 -0.411 -0.178 -0.411 ...
##  $ pauseafter.z   : num  -0.171 -0.09 0.135 -0.11 -0.537 ...
##  $ recyear.z      : num  -0.99 -0.99 -0.99 -0.99 -0.99 ...
##  $ preparedness.z : num  1.66 1.66 1.66 1.66 1.66 ...
##  $ birthyear.z    : num  1.16 1.16 1.16 1.16 1.16 ...
##  $ age.z          : num  -1.33 -1.33 -1.33 -1.33 -1.33 ...
##  $ edulevel.z     : num  0.0576 0.0576 0.0576 0.0576 0.0576 ...
##  $ nWords.z       : num  0.0382 0.0382 0.0382 0.0382 0.0382 ...
##  $ wps.z          : num  -0.0542 -0.0542 -0.0542 -0.0542 -0.0542 ...
colnames(nl)
##  [1] "wordid"          "interviewid"     "speakerid"      
##  [4] "word"            "prevword"        "prevPOS"        
##  [7] "nextword"        "nextPOS"         "component"      
## [10] "phrasenumber"    "interviewlength" "relsentpos"     
## [13] "position"        "phraselength"    "relpos"         
## [16] "isfirst"         "islast"          "duration"       
## [19] "pausebefore"     "pauseafter"      "recyear"        
## [22] "preparedness"    "sex"             "birthyear"      
## [25] "age"             "edulevel"        "wps"            
## [28] "country"         "UM"              "nWords"         
## [31] "nUM"             "nUH"             "relsentpos.z"   
## [34] "relpos.z"        "duration.z"      "pausebefore.z"  
## [37] "pauseafter.z"    "recyear.z"       "preparedness.z" 
## [40] "birthyear.z"     "age.z"           "edulevel.z"     
## [43] "nWords.z"        "wps.z"
# relative proportion of UM in the Netherlands
mean(na.exclude(nl[nl$country=='N',]$UM))
## [1] 0.08507495
# relative proportion of UM in Flanders
mean(na.exclude(nl[nl$country=='V',]$UM))
## [1] 0.1577117
# average duration of UM in the Netherlands
mean(na.exclude(nl[nl$country=='N' & nl$word=='uhm',]$duration))
## [1] 0.3073129
# average duration of UM in Flanders
mean(na.exclude(nl[nl$country=='V' & nl$word=='uhm',]$duration))
## [1] 0.5231677
  • wordid – ID of the word
  • interviewid – ID of the interview / audio sample
  • speakerid – ID of the speaker
  • word – Hesitation marker: ‘uh’ or ‘uhm’
  • prevword – Word before the hesitation marker
  • prevPOS – Part-of-speech tag of the word before the hesitation marker
  • nextword – Word after the hesitation marker
  • nextPOS – Part-of-speech tag of the word after the hesitation marker
  • component – The components of the Dutch corpus (comp-a until comp-o)
  • phrasenumber – Sequence number of the phrase in the interview / audio sample
  • interviewlength – Total number of the phrases in the interview / audio sample
  • relsentpos – Relative number of the phrase in the interview / audio sample
  • position – Position of the hesitation marker in the phrase
  • phraselength – Number of words in the phrase
  • relpos – Relative position of the hesitation marker in the phrase
  • isfirst – If the hesitation marker is the first word in the phrase (1) or not (0)
  • islast – If the hesitation marker is the last word in the phrase (1) or not (0)
  • duration – Duration in seconds of the hesitation marker
  • pausebefore – Duration of the pause after the word before the hesitation marker and the start of the hesitation marker
  • pauseafter – Duration of the pause after the hesitation marker and before the start of the next word
  • recyear – Year of recording of the interview / audio sample
  • preparedness – Degree of preparedness of the speaker for his/her speech (1: no preparation, 2: some preparation, 3: prepared)
  • sex – Gender of the speaker
  • birthyear – Year of birth of the speaker
  • age – Age of the speaker
  • edulevel – Education level of the speaker
  • wps – Words per second pronounced by the speaker (i.e. speed)
  • country – Country of the speaker (N: Netherlands, V: Vlaanderen, Flanders, the Dutch speaking part of Belgium)
  • UM – A binary variable indicating if the word is equal to UM (1: uhm) or UH (0: uh)
  • nWords – Total number of words pronounced in total by the speaker
  • nUM – Total number of UMs per speaker
  • nUH – Total number of UHs per speaker
  • .z – Z-transformation of the numerical predictors
n1 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch")
n2 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch","umwords")
n3 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch","uhwords")
n4 = visualizeUHUM(nl,"speakerid","birthyear","sex","Dutch","uhmwords")
multiplot(n1,n2,n4,n3,cols=2)

n5 = visualizeUHUM(nl,"speakerid","recyear","sex","Dutch")
n5

Final model

# duration: about 30 minutes
m <- glmer(UM ~ country + country:sex + country:age.z + preparedness.z + edulevel.z + isfirst + recyear.z + 
                islast + relpos.z + relsentpos.z  + duration.z + pausebefore.z + pauseafter.z + 
                (1|speakerid) + (1|interviewid), data=nl, family='binomial',
                control=glmerControl(optimizer="bobyqa"))

summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: UM ~ country + country:sex + country:age.z + preparedness.z +  
##     edulevel.z + isfirst + recyear.z + islast + relpos.z + relsentpos.z +  
##     duration.z + pausebefore.z + pauseafter.z + (1 | speakerid) +  
##     (1 | interviewid)
##    Data: nl
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  83530.9  83713.3 -41747.5  83494.9   185109 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2191.67    -0.26    -0.15    -0.08    33.82 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  interviewid (Intercept) 0.2112   0.4596  
##  speakerid   (Intercept) 1.2745   1.1289  
## Number of obs: 185127, groups:  interviewid, 4670; speakerid, 2574
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -2.676224   0.065678  -40.75  < 2e-16 ***
## countryV         -0.605820   0.092985   -6.52 7.26e-11 ***
## preparedness.z    0.277940   0.037512    7.41 1.27e-13 ***
## edulevel.z        0.147628   0.030573    4.83 1.37e-06 ***
## isfirst           0.512852   0.029647   17.30  < 2e-16 ***
## recyear.z         0.094139   0.025874    3.64 0.000274 ***
## islast            0.957705   0.039419   24.30  < 2e-16 ***
## relpos.z         -0.347087   0.013536  -25.64  < 2e-16 ***
## relsentpos.z     -0.069232   0.009602   -7.21 5.58e-13 ***
## duration.z        1.154341   0.010951  105.41  < 2e-16 ***
## pausebefore.z     0.170756   0.008172   20.90  < 2e-16 ***
## pauseafter.z      0.470273   0.008348   56.34  < 2e-16 ***
## countryN:sexmale -0.495872   0.077098   -6.43 1.26e-10 ***
## countryV:sexmale -0.890460   0.097126   -9.17  < 2e-16 ***
## countryN:age.z   -0.280348   0.035951   -7.80 6.28e-15 ***
## countryV:age.z   -0.623677   0.052028  -11.99  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cntryV prprd. edlvl. isfrst rcyr.z islast rlps.z rlsnt.
## countryV    -0.478                                                        
## preprdnss.z  0.493  0.060                                                 
## edulevel.z   0.051 -0.198 -0.170                                          
## isfirst     -0.037 -0.016 -0.014  0.003                                   
## recyear.z    0.010 -0.024  0.248  0.083 -0.004                            
## islast      -0.074  0.005  0.022  0.005 -0.300  0.001                     
## relpos.z     0.065 -0.004  0.002  0.000  0.413 -0.010 -0.696              
## relsentps.z -0.007 -0.004 -0.025  0.001  0.010 -0.005 -0.012 -0.001       
## duration.z  -0.033 -0.133  0.026 -0.004 -0.075  0.011 -0.002  0.002 -0.004
## pausebefr.z -0.004  0.014 -0.007 -0.001 -0.416  0.001  0.040 -0.040 -0.006
## pauseaftr.z -0.022  0.011  0.035  0.016  0.029  0.005 -0.102 -0.028 -0.018
## cntryN:sxml -0.661  0.434 -0.078 -0.105 -0.011  0.131  0.004 -0.003  0.002
## cntryV:sxml -0.074 -0.565 -0.127  0.000 -0.004  0.048 -0.001  0.003  0.003
## contryN:g.z  0.074 -0.075 -0.084 -0.034  0.010 -0.010  0.003  0.008 -0.004
## contryV:g.z -0.057  0.117 -0.120  0.028  0.008 -0.008  0.001  0.006 -0.005
##             drtn.z psbfr. psftr. cntrN: cntrV: cntN:.
## countryV                                             
## preprdnss.z                                          
## edulevel.z                                           
## isfirst                                              
## recyear.z                                            
## islast                                               
## relpos.z                                             
## relsentps.z                                          
## duration.z                                           
## pausebefr.z -0.027                                   
## pauseaftr.z  0.019  0.029                            
## cntryN:sxml -0.017 -0.002 -0.012                     
## cntryV:sxml -0.015 -0.001 -0.015  0.029              
## contryN:g.z -0.041  0.000  0.009 -0.135  0.014       
## contryV:g.z -0.041 -0.003 -0.006  0.014 -0.097  0.013
# model performance, m@frame contains all data with non-missing values
somers2(fitted(m), m@frame$UM)
##            C          Dxy            n      Missing 
## 9.094131e-01 8.188262e-01 1.851270e+05 0.000000e+00

Interpretation

Men, both in the Netherlands and in Flanders, are less likely than women to use UM (this gender effect is greatest in Flanders). Younger people, both in the Netherlands and in Flanders, are more likely to use UM than older people (this age effect is strongest in Flanders). People recorded later are more likely to prefer UM. While the model summary suggests (due to the significance of the country predictor) that women of average age in Flanders are less likely to use UM than those in the Netherlands, this is caused by the inclusion of the predictor modeling the duration of the hesitation marker (which is higher in Flanders). When duration is excluded from the model, women of average age in Flanders have a greater likelihood of using UM than those in the Netherlands. The other two subject-related predictors indicate that people with a higher education level have a greater likelihood of using UM than those with a lower education level. People who prepared what they were going to say (e.g., holding a speech versus spontaneous speech) were more likely to use UM than those with less preparation. The other predictors deal with possible different functions between UH and UM. When the position of the hesitation marker is at the beginning or at the end of the utterance, it is more likely to be UM rather than UH. Additionally, if the hesitation marker occurs later in the utterance it is less likely to be UM as opposed to UH. Whenever the utterance occurs later in the interview / audio recording, it is also less likely to contain UM. Finally, whenever the hesitation marker has longer duration, is preceded or followed by a longer pause (i.e. silence), it is more likely to be UM rather than UH.

Given the complexity of this model (the random-intercept-only model took about half an hour to fit, adding several random slopes would increase the required time to fit considerably), we fitted a series of models which only differed from the simple model above by the inclusion of a single random slope (per speaker or interview). In this way, we assessed if the inclusion of each predictor in the model as a (by-speaker or by-interview) random slope caused another predictor to become non-significant. The model reported above only contains those predictors which remained significant in all models. Note that there were a few additional interactions with country and non-speaker related predictors. These were not included in the model above as the direction of the effects was always the same in both countries and of secondary importance.

Additional models

nl2 = unique(nl[,c("speakerid","sex","birthyear","age","edulevel","wps","country","nWords","nUM","nUH","birthyear.z","age.z","edulevel.z","wps.z")])

dim(nl2)
## [1] 3727   14
# average proportion UM as opposed to UH per speaker
mean(nl2$nUM / (nl2$nUM + nl2$nUH), na.rm=T)
## [1] 0.1085622
# percentage UM of all words
mean(nl2$nUM / nl2$nWords)
## [1] 0.003704241
# percentage UH of all words
mean(nl2$nUH / nl2$nWords)
## [1] 0.03149794
# People from Flanders, men, older people and people who speak faster are less likely to use UM. 
# People with higher education are more likely to use UM. 
# (adding an interaction between country and sex or birthyear caused non-convergence, 
#  other interactions were not tested here)
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ country + sex + age.z + 
                                                edulevel.z + wps.z + (1|speakerid), 
                     data=nl2, family='binomial', control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ country + sex + age.z + edulevel.z +  
##     wps.z + (1 | speakerid)
##    Data: nl2
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  14800.0  14841.7  -7393.0  14786.0     2855 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7084 -0.4495 -0.0133  0.1839  4.9581 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  speakerid (Intercept) 1.29     1.136   
## Number of obs: 2862, groups:  speakerid, 2604
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.06992    0.04718 -128.67  < 2e-16 ***
## countryV    -0.27419    0.05827   -4.71 2.53e-06 ***
## sexmale     -0.31226    0.05478   -5.70 1.20e-08 ***
## age.z       -0.29043    0.02752  -10.55  < 2e-16 ***
## edulevel.z   0.15902    0.02789    5.70 1.18e-08 ***
## wps.z       -0.17578    0.03779   -4.65 3.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr) cntryV sexmal age.z  edlvl.
## countryV   -0.507                            
## sexmale    -0.614  0.001                     
## age.z       0.053  0.110 -0.129              
## edulevel.z  0.134 -0.165 -0.140 -0.023       
## wps.z      -0.238  0.281 -0.005  0.258  0.138
# Women in Flanders having an average age are less likely to use UH than women in the Netherlands of average age
# Men, both in Flanders and the Netherlands (more so in the former) are more likely to use UH than women.
# Older people in Flanders (but not in the Netherlands) are more likely to use UH. 
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ country + country:sex + country:age.z + 
                                                (1|speakerid), data=nl2, family='binomial', 
                     control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH, nWords - nUH) ~ country + country:sex + country:age.z +  
##     (1 | speakerid)
##    Data: nl2
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  28049.5  28091.9 -14017.8  28035.5     3154 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.70167 -0.10814  0.01579  0.10781  2.20660 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  speakerid (Intercept) 0.6359   0.7974  
## Number of obs: 3161, groups:  speakerid, 2867
## 
## Fixed effects:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.67763    0.03046 -120.75  < 2e-16 ***
## countryV         -0.75052    0.05119  -14.66  < 2e-16 ***
## countryN:sexmale  0.21013    0.04040    5.20 1.98e-07 ***
## countryV:sexmale  0.40182    0.05370    7.48 7.29e-14 ***
## countryN:age.z   -0.02619    0.01862   -1.41     0.16    
## countryV:age.z    0.14335    0.02590    5.53 3.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cntryV cntrN: cntrV: cntN:.
## countryV    -0.594                            
## cntryN:sxml -0.761  0.453                     
## cntryV:sxml  0.000 -0.620  0.000              
## contryN:g.z  0.109 -0.064 -0.155  0.000       
## contryV:g.z  0.001  0.083  0.000 -0.119  0.001
# People from Flanders of average age are less likely to use either UM or UH than people from the Netherlands
# of average age. Men are more likely to use eigher UM or UH than women. Older people in the Netherlands are
# less likely to use either UM or UH, while the pattern is reversed in Flanders. 
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ country + sex + 
                                                              country:age.z + (1|speakerid), 
                       data=nl2, family='binomial', control=glmerControl(optimizer="bobyqa")))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## cbind(nUH + nUM, nWords - nUH - nUM) ~ country + sex + country:age.z +  
##     (1 | speakerid)
##    Data: nl2
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  28721.1  28757.4 -14354.5  28709.1     3155 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.75429 -0.09778  0.01616  0.10274  2.05496 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  speakerid (Intercept) 0.6243   0.7901  
## Number of obs: 3161, groups:  speakerid, 2867
## 
## Fixed effects:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -3.57416    0.02669 -133.91  < 2e-16 ***
## countryV       -0.57363    0.03235  -17.73  < 2e-16 ***
## sexmale         0.21922    0.03182    6.89  5.6e-12 ***
## countryN:age.z -0.05379    0.01825   -2.95  0.00320 ** 
## countryV:age.z  0.08089    0.02519    3.21  0.00132 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cntryV sexmal cntN:.
## countryV    -0.429                     
## sexmale     -0.683 -0.015              
## contryN:g.z  0.079  0.008 -0.125       
## contryV:g.z  0.050  0.020 -0.073  0.010

Twitter: U.S. data

download.file('http://www.let.rug.nl/wieling/uhum/TwitterUS/twitterUS.rda','twitterUS.rda')
load('twitterUS.rda')

dim(twitterUSdat)
## [1] 25858     5
str(twitterUSdat)
## 'data.frame':    25858 obs. of  5 variables:
##  $ User  : Factor w/ 25858 levels "______Liz","______xMarie",..: 666 667 668 669 670 671 672 665 673 676 ...
##  $ Gender: Factor w/ 2 levels "F","M": 2 1 2 2 2 2 1 2 1 1 ...
##  $ nWords: int  1220 4041 8733 6511 1833 2418 11901 2960 5933 25096 ...
##  $ nUM   : int  1 2 6 0 0 0 2 1 1 6 ...
##  $ nUH   : int  0 0 1 4 1 1 1 1 0 7 ...
colnames(twitterUSdat)
## [1] "User"   "Gender" "nWords" "nUM"    "nUH"
  • User – Twitter userid (having produced at least 1000 words)
  • Gender – Gender of the Twitter user (F: female, M: male)
  • nWords – Total number of words per Twitter user
  • nUM – Total number of UMs per Twitter user
  • nUH – Total number of UHs per Twitter user
us1 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","umuh") 
us2 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","umwords") 
us3 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","uhwords") 
us4 = visualizeUHUMTwitter(twitterUSdat,"U.S. Twitter","uhmwords") 
multiplot(us1,us2,us4,us3,cols=2)

Final model

# takes about 30 seconds
m <- glmer(cbind(nUM,nUH) ~ Gender + (1|User), family='binomial', data=twitterUSdat)
summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nUH) ~ Gender + (1 | User)
##    Data: twitterUSdat
## 
##      AIC      BIC   logLik deviance df.resid 
##  56168.1  56192.5 -28081.0  56162.1    25855 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.9977 -0.5720  0.0330  0.4969  0.9076 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 6.256    2.501   
## Number of obs: 25858, groups:  User, 25858
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.42504    0.03085  13.778  < 2e-16 ***
## GenderM     -0.26477    0.04421  -5.989 2.11e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## GenderM -0.692
# model performance
somers2( fitted(m), (twitterUSdat$nUM > twitterUSdat$nUH)*1 )
##            C          Dxy            n      Missing 
## 9.999897e-01 9.999794e-01 2.585800e+04 0.000000e+00
somers2( fitted(m), twitterUSdat$nUM / (twitterUSdat$nUM + twitterUSdat$nUH) )
##            C          Dxy            n      Missing 
## 9.871825e-01 9.743650e-01 2.585800e+04 0.000000e+00

Interpretation

Men are relatively less likely to use UM than women.

Additional models

# average proportion UM as opposed to UH per speaker
mean(twitterUSdat$nUM / (twitterUSdat$nUM + twitterUSdat$nUH), na.rm=T)
## [1] 0.5334387
# percentage UM of all words
mean(twitterUSdat$nUM / twitterUSdat$nWords)
## [1] 0.0002527892
# percentage UH of all words
mean(twitterUSdat$nUH / twitterUSdat$nWords)
## [1] 0.0001878946
# Men are relatively less likely than women to use UM.
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Gender + (1|User), 
                     data=twitterUSdat,family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Gender + (1 | User)
##    Data: twitterUSdat
## 
##      AIC      BIC   logLik deviance df.resid 
##  85852.8  85877.3 -42923.4  85846.8    25855 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.36025 -0.52782  0.00306  0.50772  1.83745 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 1.346    1.16    
## Number of obs: 25858, groups:  User, 25858
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.07547    0.01455  -623.6   <2e-16 ***
## GenderM     -0.27112    0.02042   -13.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## GenderM -0.653
# Men are relatively less likely than women to use UH.
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Gender + (1|User), data=twitterUSdat,
                     family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Gender + (1 | User)
##    Data: twitterUSdat
## 
##      AIC      BIC   logLik deviance df.resid 
##  75758.6  75783.1 -37876.3  75752.6    25855 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6775 -0.5207 -0.1841  0.5154  2.3350 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 1.115    1.056   
## Number of obs: 25858, groups:  User, 25858
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.31611    0.01480  -629.5  < 2e-16 ***
## GenderM     -0.14096    0.02012    -7.0 2.43e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## GenderM -0.660
# Men are relatively less likely than women to use either UM or UH.
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + (1|User), 
                       data=twitterUSdat, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + (1 | User)
##    Data: twitterUSdat
## 
##      AIC      BIC   logLik deviance df.resid 
## 106780.8 106805.3 -53387.4 106774.8    25855 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2491 -0.3029  0.1101  0.5508  2.7342 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 0.5664   0.7526  
## Number of obs: 25858, groups:  User, 25858
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.204205   0.009272  -884.8   <2e-16 ***
## GenderM     -0.203652   0.013438   -15.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## GenderM -0.686

Twitter: Dutch data

download.file('http://www.let.rug.nl/wieling/uhum/TwitterNL/twitterNL.rda','twitterNL.rda')
load('twitterNL.rda') # Dutch Twitter data collected between 2011 and 2014

dim(twitterNL)
## [1] 38651     7
str(twitterNL)
## 'data.frame':    38651 obs. of  7 variables:
##  $ User       : Factor w/ 189907 levels "_______1992",..: 155487 62707 111648 14 15 111651 155492 111654 62710 26 ...
##  $ BirthYear  : num  1998 1993 1995 1990 1991 ...
##  $ Gender     : Factor w/ 2 levels "female","male": NA 2 NA 2 NA 2 1 NA NA NA ...
##  $ nWords     : int  2148 17699 11049 8854 182024 1839 1590 6271 1706 3120 ...
##  $ nUM        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nUH        : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ BirthYear.z: num  0.828 0.488 0.624 0.284 0.352 ...
colnames(twitterNL)
## [1] "User"        "BirthYear"   "Gender"      "nWords"      "nUM"        
## [6] "nUH"         "BirthYear.z"
  • User – Twitter userid (having produced at least 1000 words)
  • BirthYear – Year of birth of the Twitter user
  • Gender – Gender of the Twitter user (female, male)
  • nWords – Total number of words per Twitter user
  • nUM – Total number of UMs per Twitter user (i.e. all “uhm” and “ehm” occurrences)
  • nUH – Total number of UHs per Twitter user (i.e. all “uh” occurrences, “eh” is not included as it frequently is used as a substitute for the call for attention “hé”)
  • BirthYear.z – Z-transformation of BirthYear
tw1 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","umuh","nUM") 
tw2 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","umwords","nUM")
tw3 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","uhwords","nUM") 
tw4 = visualizeUHUM(twitterNL,"User","BirthYear","Gender","Dutch Twitter","uhmwords","nUM")
multiplot(tw1,tw2,tw4,tw3,cols=2)

Final model

# takes about 1 minute: Gender not significant
m <- glmer(cbind(nUM,nUH) ~ Gender + BirthYear.z + (1|User), data=twitterNL, family='binomial')

summary(m)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nUH) ~ Gender + BirthYear.z + (1 | User)
##    Data: twitterNL
## 
##      AIC      BIC   logLik deviance df.resid 
##  13453.9  13484.2  -6723.0  13445.9    14377 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.8854  0.0000  0.0000  0.0000  0.8053 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 7.258    2.694   
## Number of obs: 14381, groups:  User, 14381
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.12375    0.08952  12.553   <2e-16 ***
## Gendermale  -0.15102    0.09870  -1.530    0.126    
## BirthYear.z  0.85995    0.09394   9.154   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Gndrml
## Gendermale  -0.606       
## BirthYear.z -0.487  0.015
# model performance
twnl = twitterNL[!is.na(twitterNL$Gender) & !is.na(twitterNL$BirthYear.z),] # exclude missing values
somers2( fitted(m), (twnl$nUM > twnl$nUH)*1 )
##            C          Dxy            n      Missing 
## 9.024339e-01 8.048679e-01 1.438100e+04 0.000000e+00
somers2( fitted(m), twnl$nUM / (twnl$nUM + twnl$nUH) )
##            C          Dxy            n      Missing 
##    0.9749681    0.9499362 5321.0000000 9060.0000000

Interpretation

Younger people are relatively more likely to use UM than older people. Men tend to be relatively less likely to use UM than women, though not significantly so.

Additional models

# average proportion UM as opposed to UH per speaker
mean(twitterNL$nUM / (twitterNL$nUM + twitterNL$nUH), na.rm=T)
## [1] 0.65184
# percentage UM of all words
mean(twitterNL$nUM / twitterNL$nWords)
## [1] 0.0001069555
# percentage UH of all words
mean(twitterNL$nUH / twitterNL$nWords)
## [1] 5.501112e-05
# Men are relatively less likely than women to use UM and younger people
# are relatively more likely to use UM than older people. 
summary(mUM <- glmer(cbind(nUM, nWords - nUM) ~ Gender + BirthYear.z + (1|User), 
                     data=twitterNL,family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUM, nWords - nUM) ~ Gender + BirthYear.z + (1 | User)
##    Data: twitterNL
## 
##      AIC      BIC   logLik deviance df.resid 
##  30198.2  30228.5 -15095.1  30190.2    14377 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.9744 -0.3544 -0.2373  0.1244  3.1627 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 2.527    1.59    
## Number of obs: 14381, groups:  User, 14381
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -10.32794    0.04190 -246.51   <2e-16 ***
## Gendermale   -0.36494    0.04157   -8.78   <2e-16 ***
## BirthYear.z   0.81890    0.03672   22.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Gndrml
## Gendermale  -0.525       
## BirthYear.z -0.479  0.004
# Men are relatively less likely than women to use UH and younger people
# are relatively more likely to use UH than older people. 
summary(mUH <- glmer(cbind(nUH, nWords - nUH) ~ Gender + BirthYear.z + (1|User), 
                     data=twitterNL, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH, nWords - nUH) ~ Gender + BirthYear.z + (1 | User)
##    Data: twitterNL
## 
##      AIC      BIC   logLik deviance df.resid 
##  20115.0  20145.3 -10053.5  20107.0    14377 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.6742 -0.2228 -0.1504 -0.1000  2.5086 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 4.553    2.134   
## Number of obs: 14381, groups:  User, 14381
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -11.51287    0.06698 -171.88  < 2e-16 ***
## Gendermale   -0.26664    0.05880   -4.53 5.77e-06 ***
## BirthYear.z   0.38734    0.04561    8.49  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Gndrml
## Gendermale  -0.477       
## BirthYear.z -0.328  0.016
# Men are relatively less likely than women to use either UM or UH and younger people
# are relatively more likely to use either UM or UH than older people. 
summary(mUHUM <- glmer(cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + BirthYear.z + 
                       (1|User), data=twitterNL, family='binomial'))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: cbind(nUH + nUM, nWords - nUH - nUM) ~ Gender + BirthYear.z +  
##     (1 | User)
##    Data: twitterNL
## 
##      AIC      BIC   logLik deviance df.resid 
##  37874.3  37904.6 -18933.2  37866.3    14377 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1563 -0.4324 -0.2895  0.2192  5.0039 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  User   (Intercept) 1.869    1.367   
## Number of obs: 14381, groups:  User, 14381
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.57543    0.03196 -299.63   <2e-16 ***
## Gendermale  -0.32884    0.03407   -9.65   <2e-16 ***
## BirthYear.z  0.62213    0.02799   22.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Gndrml
## Gendermale  -0.582       
## BirthYear.z -0.442  0.011

Replication

To replicate the analysis presented above, you can just copy the following lines to the most recent version of R. Please note you also need to install Pandoc.

packages <- c("lme4","Hmisc","ggplot2","rmarkdown")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())))  
}
download.file('http://www.let.rug.nl/wieling/uhum/analysis.Rmd', 'analysis.Rmd')
library(rmarkdown)
render('analysis.Rmd') # generates html file with results
browseURL(paste('file://', file.path(getwd(),'analysis.html'), sep='')) # shows result