XDL-ACLEW/ACLEW-CDSquantity-randomsample-SM.Rmd at main · aclew/XDL-ACLEW · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title             : "Supplementary Materials: A cross-linguistic examination of young children's everyday language experiences"
shorttitle        : "SM: Cross-linguistic everyday language experiences"

author:
  - name          : "John Bunce"
    affiliation   : "1,2"
    corresponding : yes    # Define only one corresponding author
    address       : "California State University East Bay, Department of Human Development and Women's Studies, 25800 Carlos Bee Blvd, Hayward, CA 94542"
    email         : "john.bunce@csueastbay.edu"
  - name          : "Melanie Soderstrom"
    affiliation   : "2"
  - name          : "Elika Bergelson"
    affiliation   : "3"
  - name          : "Celia Rosemberg"
    affiliation   : "4"
  - name          : "Alejandra Stein"
    affiliation   : "4"
  - name          : "Florencia Alam"
    affiliation   : "4"
  - name          : "Maia Julieta Migdalek"
    affiliation   : "4"
  - name          : "Marisa Casillas"
    affiliation   : "5,6"

affiliation:
  - id            : "1"
    institution   : "California State University, East Bay"
  - id            : "2"
    institution   : "University of Manitoba"
  - id            : "3"
    institution   : "Harvard University"
  - id            : "4"
    institution   : "Centro Interdisciplinario de Investigaciones en Psicología Matemática y Experimental - CONICET"
  - id            : "5"
    institution   : "University of Chicago"
  - id            : "6"
    institution   : "Max Planck Institute for Psycholinguistics"

authornote: |
  This research was supported by the Social Sciences and Humanities Research Council of Canada (435-2015-0628, 869-2016-0003) and by Natural Sciences and Engineering Research Council of Canada (501769-2016-RGPDD) to Melanie Soderstrom; by the National Endowment for the Humanities (HJ-253479-17), National Institutes of Health Grant DP5-OD019812, and National Science Foundation BCS-1844710 to Elika Bergelson; a CONICET grant, PIP 80/2015, and a MINCyT grant, PICT 3327/2014 to Celia Rosemberg; and an NWO Veni Innovational Scheme Grant (275-89-033) to Marisa Casillas. We thank Anne Warlaumont and Caroline Rowland for contributing their datasets to this project (and for helpful feedback on this manuscript). Finally, we thank the families who participated in the recordings that made this research possible.

bibliography      : ["ACLEW-CDSquantity.bib"]

floatsintext      : yes
figurelist        : yes
tablelist         : yes
numbersections    : yes
footnotelist      : no
linenumbers       : yes
mask              : no
draft             : no
toc               : yes

documentclass     : "apa6"
classoption       : "man"
output            : papaja::apa6_word
---

```{r setup, include = FALSE}
# Project-wide setup script. The code below relies on it for
# `processed.data.path` (presumably also for the package loading and the
# `%>%` pipe -- TODO confirm against 0_setup.R).
source("0_setup.R")
library(ggridges)

# Load the three pre-processed "minimal" random-sample tables:
# the main table, ...
quantity.rand <- processed.data.path %>%
  paste0("minimal-quantity.rand.csv") %>%
  read_csv()
# ... the by-child table, ...
quantity.rand.bychild <- processed.data.path %>%
  paste0("minimal-quantity.rand.bychild.csv") %>%
  read_csv()
# ... and the table broken down by speaker type (".st")
quantity.rand.st <- processed.data.path %>%
  paste0("minimal-quantity.rand.st.csv") %>%
  read_csv()
```

```{r analysis-preferences, include = FALSE}
# Fix the random number generator seed so that repeated knits draw the same
# pseudo-random numbers
set.seed(seed = 42)
# Register knitr's `rand_seed` as extra cache information for all chunks
# (presumably so that cached chunks are re-run when the RNG state differs --
# see the knitr caching documentation)
knitr::opts_chunk$set(cache.extra = knitr::rand_seed)
```

Before describing the contents of these Supplementary Materials, we remind readers that our analyses are centered on the dependent variables of minutes-per-hour rate of target-child-directed speech (TCDS), other-child-directed speech (OCDS), all child-directed speech (CDS, derived from TCDS + OCDS), and adult-directed speech (ADS). Please see the main manuscript for reasoning and details. Below we first briefly describe the contents of each sub-section:

*Section 1.* For those interested in knowing about the _total quantity_ of child-directed speech in these children's environments (i.e., all input that is designed for child addressees; TCDS + OCDS), the first section includes analyses of all child-directed speech (CDS) that parallel what is reported in the main text for target-child-directed speech (TCDS).

*Section 2.* The second section gives expanded analyses on the number of talkers present. The main manuscript demonstrates strong effects of the number of talkers present in a given clip and motivates the inclusion of number of talkers in the primary statistical models. This section gives more descriptive information about number of talkers typical in each language group and preliminarily explores how differences in typical number of talkers present may account for main-text patterns in different input source types across language groups and target child age.

*Section 3.* The third section shows the distribution of target child age by corpus and delves further into discussion of the lack of simple age effects in the primary analyses.

*Section 4.* The fourth section uses a set of alternative models of the three dependent variables---TCDS, CDS, and ADS---to examine cross-group differences in these input sources that are naïve to effects of number of talkers and talker type, effects which may partly reflect cultural patterns.

*Section 5.* The fifth section breaks up the North American English language group into individual corpora for those interested in examining potential differences in future work.

*Section 6.* The sixth section gives tabular regression outputs for the full zero-inflated negative binomial mixed-effects regression models of TCDS, CDS, and ADS on which the results in the main text and in Section 1 of these Supplementary Materials are based. Also provided is the full suite of alternative models for TCDS and ADS, in which we run one model each for all possible reference levels of language group.

*Section 7.* The seventh section shows a marginal means plot of model-estimated rates of TCDS and ADS across language group and age, given that the main-text plot illustrates raw data with no age effects.

*Section 8.* The eighth and final section shows confusion matrices for addressee-type annotations (e.g., target-child versus other-child status of an utterance) overall and for each contributed corpus individually.

As in the main text for this study, all statistical analyses were conducted in R with the glmmTMB package [@brooks2017modeling; @R-base] and all figures were generated with ggplot2  [@R-ggplot2]. Analysis scripts and anonymized data are available at  URL_MASKED_FOR_REVIEW<!--https://github.com/aclew/random_sample_preliminary_results-->.

\newpage

```{r tbl-analysis-prep, include = FALSE}
## Get variables ready for modeling
# `tchiyr.m` and `tchiyr.sd` are not defined in this file (presumably they
# are the age mean and SD set up in 0_setup.R -- TODO confirm).

# Quantity rand: z-score child age and the by-clip number of speakers, and
# make NA_English the reference level of language group
nspkrs.m <- mean(quantity.rand$n_spkrs_clip)
nspkrs.sd <- sd(quantity.rand$n_spkrs_clip)
quantity.rand <- quantity.rand %>%
  mutate(
    tchiyr.std = (age_mo_round - tchiyr.m) / tchiyr.sd,
    nsk.std = (n_spkrs_clip - nspkrs.m) / nspkrs.sd,
    group_corpNE = factor(group_corpNE, levels = c(
      "NA_English", "UK_English", "Arg_Spanish", "Tseltal", "Yeli_Dnye")))

# Quantity rand by speaker type: same standardization, but using the
# per-speaker-type speaker counts; NA_English and Woman are the reference
# levels of language group and speaker type, respectively
nspkrs.sa.m <- mean(quantity.rand.st$n_spkrs_clip.st)
nspkrs.sa.sd <- sd(quantity.rand.st$n_spkrs_clip.st)
quantity.rand.st <- quantity.rand.st %>%
  mutate(
    tchiyr.std = (age_mo_round - tchiyr.m) / tchiyr.sd,
    nsk.st.std = (n_spkrs_clip.st - nspkrs.sa.m) / nspkrs.sa.sd,
    group_corpNE = factor(group_corpNE, levels = c(
      "NA_English", "UK_English", "Arg_Spanish", "Tseltal", "Yeli_Dnye")),
    SpkrType = factor(SpkrType, levels = c("Woman", "Man", "Child"))) %>%
  # add by-clip nsk scores (i.e., not divided by speaker type)
  # NOTE(review): no `by` is given, so this joins on every column the two
  # tables share (presumably aclew_child_id and segment -- confirm)
  left_join(
    quantity.rand %>%
      dplyr::select(aclew_child_id, segment, n_spkrs_clip, nsk.std))
```

```{r tab4, echo=FALSE, message=FALSE, warning=FALSE, results = "asis"}
# Table of input rates by language group. Each summary below collapses the
# by-child table to one row per language group, giving the mean, median,
# minimum, and maximum of the children's rates for one input type.
by.corp.rates.TDS <- quantity.rand.bychild %>%
  group_by(group_corpNE) %>%
  summarize(
    mean_tds.mph = mean(tds_mph),
    median_tds.mph = median(tds_mph),
    min_tds.mph = min(tds_mph),
    max_tds.mph = max(tds_mph))

by.corp.rates.OCDS <- quantity.rand.bychild %>%
  group_by(group_corpNE) %>%
  summarize(
    mean_ocds.mph = mean(ocds_mph),
    median_ocds.mph = median(ocds_mph),
    min_ocds.mph = min(ocds_mph),
    max_ocds.mph = max(ocds_mph))

by.corp.rates.ADS <- quantity.rand.bychild %>%
  group_by(group_corpNE) %>%
  summarize(
    mean_ads.mph = mean(ads_mph),
    median_ads.mph = median(ads_mph),
    min_ads.mph = min(ads_mph),
    max_ads.mph = max(ads_mph))

# Format table cells as "mean (median; min-max)", rounded to two decimals.
# All four arguments are numeric vectors of equal length; returns a
# character vector of that length.
format_rate_cell <- function(avg, med, lo, hi) {
  paste0(round(avg, 2), " (", round(med, 2),
    "; ", round(lo, 2), "-", round(hi, 2), ")")
}

by.corp.rates <- by.corp.rates.TDS %>%
  left_join(by.corp.rates.OCDS, by = "group_corpNE") %>%
  left_join(by.corp.rates.ADS, by = "group_corpNE") %>%
  mutate(
    # Attach display labels first (they are listed in the sorted order of
    # the raw group codes, which is the default level order of factor()),
    # then put the groups into the row order wanted for the table
    group_corpNE = factor(group_corpNE,
      labels = c("Arg. Spanish", "NA English", "Tseltal", "UK English",
        "Yélî Dnye")),
    group_corpNE = factor(group_corpNE,
      levels = c("NA English", "UK English", "Arg. Spanish",
        "Tseltal", "Yélî Dnye")),
    TDS.rate = format_rate_cell(
      mean_tds.mph, median_tds.mph, min_tds.mph, max_tds.mph),
    OCDS.rate = format_rate_cell(
      mean_ocds.mph, median_ocds.mph, min_ocds.mph, max_ocds.mph),
    ADS.rate = format_rate_cell(
      mean_ads.mph, median_ads.mph, min_ads.mph, max_ads.mph),
    # Proportions of the summed mean rates (TCDS + OCDS + ADS)
    mean.TDS.prop = round(mean_tds.mph/
                            (mean_tds.mph + mean_ocds.mph + mean_ads.mph), 2),
    mean.CDS.prop = round((mean_tds.mph + mean_ocds.mph)/
                            (mean_tds.mph + mean_ocds.mph + mean_ads.mph), 2)) %>%
  # Keep and rename only the display columns. The last header reads
  # "TCDS + OCDS" because the numerator above sums the TCDS and OCDS rates.
  dplyr::select(
    Language = group_corpNE,
    `TCDS rate` = TDS.rate,
    `OCDS rate` = OCDS.rate,
    `ADS rate` = ADS.rate,
    `Mean proportion TCDS` = mean.TDS.prop,
    `Mean proportion any CDS (TCDS + OCDS)` = mean.CDS.prop) %>%
  arrange(Language)

apa_table(by.corp.rates, caption="Average input rates per clip across participants for each corpus. Parentheses following the mean indicate the median and range across participants. OCDS indicates rate of input directed to non-target-child children; CDS sums rates of TCDS and OCDS.")
```

```{r CDS_cross_corp_analysis, message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}
# CDS (speech from non-target child speakers directed to any child, which can
# include the target child when they are addressed w/ one other child PLUS
# speech exclusively directed to the target child ... i.e., all hearable CDS)
# TCDS + OCDS
#
# Zero-inflated negative binomial (nbinom1) mixed model of the rounded
# per-speaker-type CDS rate, with a random intercept per child.
# NOTE(review): the per-term notes below mention TCDS although the outcome
# here is CDS; presumably they mirror the parallel TCDS model -- confirm.
cds.rand.st.zinb <- glmmTMB(round(cds_mph.st,0) ~
    tchiyr.std + # no change with age (or slight increase)
    SpkrType + # Woman > {Man, Child}
    nsk.std + # more speakers = more speech
    group_corpNE + # more TCDS in WEIRD
    SpkrType:group_corpNE + # more child speakers in non-WEIRD
    tchiyr.std:SpkrType + # more TCDS from children (and perhaps also from men) with age
    (1|aclew_child_id),
  data=quantity.rand.st,
  # The probability of producing a structural zero
  ziformula=~tchiyr.std, # + group_corpNE, # removed group_corpNE for convergence
  family="nbinom1")
# summary(cds.rand.st.zinb)
# cds.rand.st.zinb.res = simulateResiduals(cds.rand.st.zinb)
# plot(cds.rand.st.zinb.res, rank = TRUE)

cds.rand.st.zinb.disp <- round(sigma(cds.rand.st.zinb), 2)

# Summarize the model once and reuse its coefficient tables: the first
# element holds the count-model terms, the second the zero-inflation terms.
# Each object extracted below is one row of such a table; the text reads its
# elements 1-4 as B, SE, z, and p.
cds.rand.st.zinb.COEFS <- coef(summary(cds.rand.st.zinb))
cds.rand.st.zinb.COEFS.count <- cds.rand.st.zinb.COEFS[[1]]
cds.rand.st.zinb.COEFS.zi <- cds.rand.st.zinb.COEFS[[2]]

# Count model: main effects
cds.rand.st.zinb.COEF.age <-
  cds.rand.st.zinb.COEFS.count["tchiyr.std",]
cds.rand.st.zinb.COEF.man <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeMan",]
cds.rand.st.zinb.COEF.child <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeChild",]
cds.rand.st.zinb.COEF.nsk <-
  cds.rand.st.zinb.COEFS.count["nsk.std",]
cds.rand.st.zinb.COEF.spanish <-
  cds.rand.st.zinb.COEFS.count["group_corpNEArg_Spanish",]
cds.rand.st.zinb.COEF.tseltal <-
  cds.rand.st.zinb.COEFS.count["group_corpNETseltal",]
cds.rand.st.zinb.COEF.ukenglish <-
  cds.rand.st.zinb.COEFS.count["group_corpNEUK_English",]
cds.rand.st.zinb.COEF.yelidnye <-
  cds.rand.st.zinb.COEFS.count["group_corpNEYeli_Dnye",]

# Count model: speaker type by language group
cds.rand.st.zinb.COEF.manSpanish <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeMan:group_corpNEArg_Spanish",]
cds.rand.st.zinb.COEF.childSpanish <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeChild:group_corpNEArg_Spanish",]
cds.rand.st.zinb.COEF.manTseltal <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeMan:group_corpNETseltal",]
cds.rand.st.zinb.COEF.childTseltal <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeChild:group_corpNETseltal",]
cds.rand.st.zinb.COEF.manUKEnglish <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeMan:group_corpNEUK_English",]
cds.rand.st.zinb.COEF.childUKEnglish <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeChild:group_corpNEUK_English",]
cds.rand.st.zinb.COEF.manYeliDnye <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeMan:group_corpNEYeli_Dnye",]
cds.rand.st.zinb.COEF.childYeliDnye <-
  cds.rand.st.zinb.COEFS.count["SpkrTypeChild:group_corpNEYeli_Dnye",]

# Count model: child age by speaker type
cds.rand.st.zinb.COEF.ageMan <-
  cds.rand.st.zinb.COEFS.count["tchiyr.std:SpkrTypeMan",]
cds.rand.st.zinb.COEF.ageChild <-
  cds.rand.st.zinb.COEFS.count["tchiyr.std:SpkrTypeChild",]

# Zero-inflation model (child age only; the language-group terms are
# disabled together with their predictor in `ziformula` above)
cds.rand.st.zinb.COEF.age.ZI <-
  cds.rand.st.zinb.COEFS.zi["tchiyr.std",]
# cds.rand.st.zinb.COEF.spanish.ZI <-
#   coef(summary(cds.rand.st.zinb))[[2]]["group_corpNEArg_Spanish",]
# cds.rand.st.zinb.COEF.tseltal.ZI <-
#   coef(summary(cds.rand.st.zinb))[[2]]["group_corpNETseltal",]
# cds.rand.st.zinb.COEF.ukenglish.ZI <-
#   coef(summary(cds.rand.st.zinb))[[2]]["group_corpNEUK_English",]
# cds.rand.st.zinb.COEF.yelidnye.ZI <-
#   coef(summary(cds.rand.st.zinb))[[2]]["group_corpNEYeli_Dnye",]

# Save the tidied model table, tagged with the model name, as a CSV file
cds.model.table <- broom.mixed::tidy(cds.rand.st.zinb) %>%
    mutate(model = "CDS_random_z-inb")
write_csv(cds.model.table, "CDS_random_z-inb.csv")
```

```{r cdsmodelcoeffs, include=FALSE}
# Helpers shared by the CDS and TCDS coefficient plots. This chunk only
# builds objects, so nothing from it is included in the rendered document.

# Collect the confidence intervals of a fitted model's count-model fixed
# effects and prepare them for plotting.
#   mdl: a fitted model accepted by confint() (here: the glmmTMB fits)
# Returns a tibble with one row per predictor: the columns produced by
# confint(), `Predictor` as a factor in display order, `pointcolor` (a
# placeholder color name per language group, "black" otherwise), and
# `shape` ("M"/"C" for man-/child-related terms, "" otherwise).
count_model_cis <- function(mdl) {
  cis <- confint(mdl) %>%
    as_tibble(rownames = "Predictor") %>%
    # drop zero-inflation terms and all intercept-related rows
    filter(!grepl("zi", Predictor), !grepl("Intercept", Predictor))

  # Turn model term names into display labels. Dots are escaped or matched
  # literally so that they cannot act as regex wildcards.
  pred <- gsub("cond\\.|SpkrType|group_corpNE", "", cis$Predictor)
  pred <- gsub("_", " ", pred, fixed = TRUE)
  pred <- gsub("tchiyr.std", "Age", pred, fixed = TRUE)
  pred <- gsub("nsk.std", "# Talkers", pred, fixed = TRUE)
  pred <- gsub("Yeli Dnye", "Yélî Dnye", pred, fixed = TRUE)

  # Fixed display order; labels not listed here become NA
  cis$Predictor <- factor(pred, levels = c(
    "Age", "# Talkers", "Man", "Child",
    "UK English", "Arg Spanish", "Tseltal", "Yélî Dnye",
    "Man:UK English", "Man:Arg Spanish", "Man:Tseltal", "Man:Yélî Dnye",
    "Child:UK English", "Child:Arg Spanish", "Child:Tseltal", "Child:Yélî Dnye",
    "Age:Man", "Age:Child"
  ))

  cis %>%
    mutate(
      pointcolor = case_when(
        grepl("UK English", Predictor) ~ "yellow",
        grepl("Arg Spanish", Predictor) ~ "green",
        grepl("Tseltal", Predictor) ~ "blue",
        grepl("Yélî Dnye", Predictor) ~ "purple",
        TRUE ~ "black"
      ),
      shape = case_when(
        grepl("Man", Predictor) ~ "M",
        grepl("Child", Predictor) ~ "C",
        TRUE ~ ""
      )
    )
}

# Draw coefficients with their 95% confidence intervals.
#   cis: a tibble as returned by count_model_cis()
#   point.colors: named character vector mapping each `pointcolor` value to
#     the color actually drawn
# Terms related to men/children are drawn as the letters "M"/"C"; all other
# terms are drawn as filled points. Returns a ggplot object.
count_model_coef_plot <- function(cis, point.colors) {
  ggplot(cis,
         aes(x = Estimate,
             y = reorder(Predictor, desc(Predictor)),
             xmin = `2.5 %`, xmax = `97.5 %`,
             color = pointcolor, shape = shape)) +
    geom_vline(xintercept = 0, lty = "dashed") +
    geom_linerange() +
    geom_point(data = filter(cis, shape == ""), shape = 19) +
    geom_text(aes(label = shape)) +
    scale_color_manual(values = point.colors) +
    ylab("Predictor") +
    xlab("Coefficient [95% CI]") +
    scale_x_continuous(breaks = c(-4, -3, -2, -1, 0, 1, 2, 3, 4)) +
    coord_cartesian(xlim = c(-4.3, 4.3)) +
    theme_apa() +
    theme(legend.position = "none")
}

# Five-color default hue palette. The placeholder color names are mapped to
# it by name, so the assignment does not depend on how the names sort or on
# which of them occur in the data.
col.scale <- hue_pal()(5)
mdl.point.colors <- c(
  black = "black",
  blue = col.scale[4],    # Tseltal
  green = col.scale[3],   # Arg Spanish
  purple = col.scale[5],  # Yélî Dnye
  yellow = col.scale[2])  # UK English

cds.rand.st.zinb.CIs.countonly <- count_model_cis(cds.rand.st.zinb)
cds.mdl.plot <- count_model_coef_plot(
  cds.rand.st.zinb.CIs.countonly, mdl.point.colors)
```

```{r TCDS_cross_corp_analysis, message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}
# TCDS = any speech from non-target child speakers addressed to the target child
# Same model structure as the CDS model, except that language group is also
# a predictor in the zero-inflation component here.
tds.rand.st.zinb <- glmmTMB(round(tds_mph.st,0) ~
    tchiyr.std + # no change with age (or slight increase)
    SpkrType + # Woman > {Man, Child}
    nsk.std + # more speakers = more speech
    group_corpNE + # more TCDS in WEIRD
    SpkrType:group_corpNE + # more child speakers in non-WEIRD
    tchiyr.std:SpkrType + # more TCDS from children (and perhaps also from men) with age
    (1|aclew_child_id),
  data=quantity.rand.st,
  # The probability of producing a structural zero
  ziformula=~tchiyr.std + group_corpNE,
  family="nbinom1")
# summary(tds.rand.st.zinb)
# tds.rand.st.zinb.res = simulateResiduals(tds.rand.st.zinb)
# plot(tds.rand.st.zinb.res, rank = TRUE)
```

```{r tdsmodelcoeffs, include=FALSE}
# Same table and plot as for CDS, built with the helpers and the color
# mapping defined in the cdsmodelcoeffs chunk
tds.rand.st.zinb.CIs.countonly <- count_model_cis(tds.rand.st.zinb)
tds.mdl.plot <- count_model_coef_plot(
  tds.rand.st.zinb.CIs.countonly, mdl.point.colors)
```

```{r modelcoeffsplotstdscds, echo=FALSE, fig.align = "center", fig.cap="Coefficients and 95% confidence intervals for the count models of TCDS (left) and CDS (i.e. TCDS + OCDS; right) for all included fixed effects. This figure differs from the similar one in the main text, which by contrast features TCDS and ADS. Color indicates population (North American English is the modeled reference level), 'C' and 'M' indicate effects related to child- and man-produced speech, respectively (woman-produced speech is set as the model reference level).", out.width = '100%'}
# save the figure for later
mdl.summary.png <- "plots/TCDS_CDS_mdl_summary.png"
# make sure the output directory exists so that the export cannot fail on a
# fresh checkout; this is a no-op when the directory is already there
dir.create(dirname(mdl.summary.png), showWarnings = FALSE, recursive = TRUE)
# TCDS on the left, CDS on the right; verbose = FALSE keeps the export
# message out of the rendered document
ggarrange(tds.mdl.plot, cds.mdl.plot, ncol = 2, nrow = 1,
          labels = c("TCDS", "CDS")) %>%
  ggexport(filename = mdl.summary.png,
           width = 2500, height = 1200,
           res = 300, verbose = FALSE)

# show as saved
knitr::include_graphics(mdl.summary.png)
```

# All child-directed speech (CDS)

The analysis of directed linguistic input in the main text focuses on TCDS; that is, input that is exclusively directed to the target child. And yet other types of child-directed input in the environment may also contain the linguistic and communicative features that are associated with language learning. We here analyze _all_ CDS in the recordings using the same factors as we did for TCDS in the main text. "CDS" here includes all utterances directed to the target child, plus all other observable child-directed speech in the audio recording clips, including input directed at groups of children that may or may not include the target child. In other words, "CDS" here comprises all hearable utterances that are directed to a child in the recording, comparable to what is measured in @bergelsoncasillas2019what. Therefore, this measure of CDS includes all input designed for a child listener within earshot of the target child wearing the recorder. Keep in mind, however, that much of this input is likely to have been addressed to children of a different age than the target child, to children far away from the target child, or even occasionally to children in a different language than what is typically used for the target child. We gloss over these issues here, as we do not have the annotations to tease each of them apart. Our aim instead is to provide a statistical analysis of CDS that parallels that of TCDS reported in the main text.

On average, across all language groups children were exposed to `r round(mean(quantity.rand.bychild$cds_mph),2)` minutes of CDS per hour across audio clips (median = `r round(median(quantity.rand.bychild$cds_mph),2)`), with wide variation between children (range = `r round(min(quantity.rand.bychild$cds_mph),2)`–`r round(max(quantity.rand.bychild$cds_mph),2)`). Our model of CDS rate was nearly identical to that used for TCDS rate in the main text: It included target child age, talker type, the number of talkers present in the clip, and language group, with two additional two-way interactions (talker type by language group and child age by talker type) and random intercepts by child, adding only child age in the zero-inflation model component. The only difference from the main-text model of TCDS was that we did not include language group as a predictor in the zero-inflation component because its inclusion caused model non-convergence issues. As a reminder, there was no significant effect of language group in the zero-inflation model component of the main-text TCDS model. This fact, together with the qualitatively similar pattern of findings for CDS in the present model, suggests that the reported pattern of findings is robust to this small difference in model structure (N = `r nobs(cds.rand.st.zinb)`, log-likelihood = `r round(logLik(cds.rand.st.zinb)[1], 2)`, overdispersion estimate = `r cds.rand.st.zinb.disp`, formula = CDS.min.p.hr ~ child.age + talker.type + num.tlkrs.in.clip + lg.grp + talker.type:lg.grp + child.age:talker.type + (1 | child.id), ziformula = ~ child.age). The results are qualitatively highly similar to those of the TCDS model presented in the main text. The coefficients and 95% confidence intervals for all fixed effects in the CDS count model are shown in Figure \@ref(fig:modelcoeffsplotstdscds), side by side with the same plot from the TCDS model, which is replicated from the main text.

CDS input rate significantly differed by talker type, number of talkers present, language group, and the interaction between talker type and language group. As with TCDS rate, CDS rate was significantly lower for men compared to women (_B_ = `r round(cds.rand.st.zinb.COEF.man[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.man[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.man[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.man[[4]])`) and for children compared to women (_B_ = `r round(cds.rand.st.zinb.COEF.child[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.child[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.child[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.child[[4]])`). CDS rate was, like TCDS rate, also significantly higher when there were more talkers present (_B_ = `r round(cds.rand.st.zinb.COEF.nsk[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.nsk[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.nsk[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.nsk[[4]])`).

As with TCDS, rates of CDS in Yélî Dnye were significantly lower compared to North American English (_B_ = `r round(cds.rand.st.zinb.COEF.yelidnye[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.yelidnye[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.yelidnye[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.yelidnye[[4]])`), with no significant differences between North American English and the other language groups (all _p_'s ≥ .3).

Interactions between talker type and language group were overall similar, with some small differences. Men were previously found to produce significantly more TCDS in the Argentinian Spanish and Yélî Dnye samples compared to North American English. When this measure is changed to CDS, the difference only remains apparent for Argentinian Spanish compared to North American English (_B_ = `r round(cds.rand.st.zinb.COEF.manSpanish[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.manSpanish[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.manSpanish[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.manSpanish[[4]])`), though the Yélî Dnye data still point in the same direction (_B_ = `r round(cds.rand.st.zinb.COEF.manYeliDnye[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.manYeliDnye[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.manYeliDnye[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.manYeliDnye[[4]])`). Children were previously found to produce significantly more TCDS in all four of the non-North American English samples compared to North American English. 
When this measure is changed to CDS, the difference remains apparent for all cases except UK English, which still goes in the same direction (UK English: _B_ = `r round(cds.rand.st.zinb.COEF.childUKEnglish[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.childUKEnglish[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.childUKEnglish[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.childUKEnglish[[4]])`; Argentinian Spanish: _B_ = `r round(cds.rand.st.zinb.COEF.childSpanish[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.childSpanish[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.childSpanish[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.childSpanish[[4]])`; Tseltal: _B_ = `r round(cds.rand.st.zinb.COEF.childTseltal[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.childTseltal[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.childTseltal[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.childTseltal[[4]])`; Yélî Dnye: _B_ = `r round(cds.rand.st.zinb.COEF.childYeliDnye[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.COEF.childYeliDnye[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.COEF.childYeliDnye[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.COEF.childYeliDnye[[4]])`).

Interactions between talker type and age differed between TCDS and CDS. Whereas the previous analysis suggested that child-produced, but not man-produced, TCDS grows more with age compared to woman-produced TCDS, there are no significant differences across age by talker type with the input measure of CDS (Men _p_ `r pval.display(cds.rand.st.zinb.COEF.ageMan[[4]])`; Children _p_ `r pval.display(cds.rand.st.zinb.COEF.ageChild[[4]])`).

Like the model of TCDS rate, the zero-inflation regression component for CDS did not suggest any additional evidence for effects of child age (_p_ `r pval.display(cds.rand.st.zinb.COEF.age.ZI[[4]])`).

```{r nskplots, echo=FALSE, fig.align = "center", fig.cap="Number of talkers present across language groups, talker types, and target child age. Each datapoint represents the mean from one recording.", out.width = '100%'}
# descriptive stats for text
# Talker counts per language group: average the per-clip counts within each
# child first, then average those per-child values within the group.
# `.groups` is spelled out at every step (matching the other summaries in this
# chunk) so dplyr does not emit its regrouping message into the knitted output;
# "drop_last" is what the implicit default did, so the results are unchanged.
by.corp.nsk.avgs <- quantity.rand %>%
  group_by(group_corp, aclew_child_id) %>%
  summarize(
    .groups = "drop_last",
    mean = mean(n_spkrs_clip),
    median = median(n_spkrs_clip)) %>%
  group_by(group_corp) %>%
  summarize(
    .groups = "drop_last",
    m.mean = mean(mean),
    m.median = mean(median))

# Same two-step averaging, now split by talker type.
by.corp.nsk.avgs.st <- quantity.rand.st %>%
  group_by(group_corp, aclew_child_id, SpkrType) %>%
  summarize(
    .groups = "drop_last",
    mean = mean(n_spkrs_clip.st),
    median = median(n_spkrs_clip.st)) %>%
  group_by(group_corp, SpkrType) %>%
  summarize(
    .groups = "drop_last",
    m.mean = mean(mean),
    m.median = mean(median))

# Average of the per-group values for each talker type (each language group
# carries equal weight, regardless of how many recordings it has).
overall.nsk.avgs.st <- by.corp.nsk.avgs.st %>%
  group_by(SpkrType) %>%
  summarize(
    .groups = "drop_last",
    m.m.mean = mean(m.mean),
    m.m.median = mean(m.median))

# now actually for the plots...
# One row per recording x talker type, summarizing the per-clip number of
# talkers of that type.
by.rec.ests.nsk.age <- quantity.rand.st %>%
  group_by(group_corpNE, aclew_child_id, age_mo_round, SpkrType) %>%
  summarize(
    .groups = "drop",
    mean.nsk.st = mean(n_spkrs_clip.st),
    sd.nsk.st = sd(n_spkrs_clip.st),
    median.nsk.st = median(n_spkrs_clip.st),
    min.nsk.st = min(n_spkrs_clip.st),
    max.nsk.st = max(n_spkrs_clip.st))

# Map language-group codes to display names BY NAME. Passing only `labels=` to
# factor() relabels the existing levels positionally, so the result depends on
# the level order of group_corpNE; elsewhere in this file the same variable is
# labelled in a different positional order, so one of the two was mislabelling
# groups. An explicit code -> label lookup is correct whatever that order is.
# NOTE(review): assumes group_corpNE uses the same codes as group_corp
# ("NA_English", "UK_English", ...) -- the check below fails loudly otherwise.
sample.labels <- c(
  NA_English = "NA English", UK_English = "UK English",
  Arg_Spanish = "Arg. Spanish", Tseltal = "Tseltal",
  Yeli_Dnye = "Yélî Dnye")
nsk.grp.codes <- as.character(by.rec.ests.nsk.age$group_corpNE)
stopifnot(all(is.na(nsk.grp.codes) | nsk.grp.codes %in% names(sample.labels)))
# level order (and hence legend/color order) is the display order listed above
by.rec.ests.nsk.age$Sample <- factor(
  nsk.grp.codes,
  levels = names(sample.labels),
  labels = unname(sample.labels))

# Mean number of talkers by child age, one facet per talker type; the legend is
# hidden here because the overall panel built later in this chunk carries it.
nsk.x.lg.age <- ggplot(by.rec.ests.nsk.age, aes(
  x = age_mo_round, y = mean.nsk.st,
  color = Sample, fill = Sample)) +
  facet_wrap(~ SpkrType) +
  scale_y_continuous(
    breaks = seq(0, 6, 1)) +
  coord_cartesian(ylim = c(0, 6)) +
  scale_x_continuous(
    breaks = seq(0, 36, 6)) +
  ylab("Mean # talkers") +
  geom_smooth(method = "lm", alpha = 0.15) +
  geom_jitter() +
  theme_apa() +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none"
  )

# One row per recording, summarizing the per-clip total number of talkers
# (all talker types together).
by.rec.ests.nsk.age.overall <- quantity.rand.st %>%
  group_by(group_corpNE, aclew_child_id, age_mo_round) %>%
  summarize(
    .groups = "drop",
    mean.nsk = mean(n_spkrs_clip),
    sd.nsk = sd(n_spkrs_clip),
    median.nsk = median(n_spkrs_clip),
    min.nsk = min(n_spkrs_clip),
    max.nsk = max(n_spkrs_clip))

# Map language-group codes to display names BY NAME rather than relying on the
# positional order of the levels of group_corpNE (factor(x, labels = ...) pairs
# labels with whatever the existing level order happens to be, which silently
# swaps group names if that order is not the one assumed).
# NOTE(review): assumes group_corpNE uses the same codes as group_corp
# ("NA_English", "UK_English", ...) -- the check below fails loudly otherwise.
sample.labels <- c(
  NA_English = "NA English", UK_English = "UK English",
  Arg_Spanish = "Arg. Spanish", Tseltal = "Tseltal",
  Yeli_Dnye = "Yélî Dnye")
nsk.overall.grp.codes <- as.character(by.rec.ests.nsk.age.overall$group_corpNE)
stopifnot(all(
  is.na(nsk.overall.grp.codes) |
    nsk.overall.grp.codes %in% names(sample.labels)))
# level order (and hence legend/color order) is the display order listed above
by.rec.ests.nsk.age.overall$Sample <- factor(
  nsk.overall.grp.codes,
  levels = names(sample.labels),
  labels = unname(sample.labels))

# Mean total number of talkers by child age, colored by language group; this
# panel carries the legend (placed at the bottom).
nsk.x.lg.age.overall <- ggplot(by.rec.ests.nsk.age.overall, aes(
  x = age_mo_round, y = mean.nsk,
  color = Sample, fill = Sample)) +
  scale_y_continuous(
    breaks = seq(0, 10, 1)) +
  coord_cartesian(ylim = c(0, 10)) +
  scale_x_continuous(
    breaks = seq(0, 36, 6)) +
  ylab("Mean # talkers") +
  geom_smooth(method = "lm", alpha = 0.15) +
  geom_jitter() +
  theme_apa() +
  theme(
    axis.title.x = element_blank(),
    legend.position = "bottom"
  )

# Combine the two panels: overall talker counts first, by-talker-type facets
# second (presumably stacked vertically by the plot-composition `/` operator;
# confirm against the package that provides it).
nsk.plot <- (nsk.x.lg.age.overall) / (nsk.x.lg.age)

# Write the combined figure to disk as a 20 x 15 cm PNG at 300 dpi.
ggsave(
  filename = "NTK_age_summary.png",
  path = "plots/",
  plot = nsk.plot,
  device = "png",
  width = 20, height = 15, units = "cm",
  dpi = 300,
  scale = 1)

# show as saved
knitr::include_graphics("plots/NTK_age_summary.png")
```

\newpage

# Typical numbers of talkers by corpus

Our primary statistical models account for the number of talkers present in a clip, with the idea that the presence of more talkers leads to more talk in the clip. This is trivially true in the sense that a talker in the clip isn't "counted" unless they talk at least once. The model effects of TCDS, CDS, and ADS, all show very strong effects of number of talkers in the clip, and suggest that the presence of others has a more-than-minimal effect on how much input children encounter, particularly for ADS. Further, we anticipated that some of the differences apparent between language groups are actually due to the greater or lesser number of people typically present around children. For example, we suspected that the organization of households and the number of children per household would lead to greater presence of both adults and children in the Yélî Dnye, Tseltal, and Argentinian Spanish recordings. Without controlling for the number of talkers present in our statistical models, it would have been impossible to tell what portion of cross-group differences in input rates is simply due to number of present talkers versus other differences in culture, language, and daily life. Our main-text results thus reflect estimates of cultural difference controlling for number of talkers as a separate and significant factor.

However, it is also the case that number of talkers may systematically differ between language groups in a way that approximates important cultural differences. We here analyze the number of talkers detected in clips across language groups and age, both overall and by talker type. Our aim is to illustrate the scale of cross-group differences in potential available interlocutors, which likely reflects differences in household size, household organization, and child caregiving practices. We only superficially characterize these differences here, leaving it to future work to more deeply engage with how these patterns reflect population-specific practices.

On average, and in addition to any vocalizations by the target child, a given audio clip included at least one utterance from `r round(mean(by.corp.nsk.avgs$m.mean),2)` other talkers (median = `r round(mean(by.corp.nsk.avgs$m.median),2)`; range over all clips = `r min(quantity.rand$n_spkrs_clip)`--`r max(quantity.rand$n_spkrs_clip)`). By talker type, those other interlocutors included an average of `r round(filter(overall.nsk.avgs.st, SpkrType == "Woman")$m.m.mean,2)` women, `r round(filter(overall.nsk.avgs.st, SpkrType == "Man")$m.m.mean,2)` men, and `r round(filter(overall.nsk.avgs.st, SpkrType == "Child")$m.m.mean,2)` children (medians are `r round(overall.nsk.avgs.st$m.m.median,2)[1]`, `r round(overall.nsk.avgs.st$m.m.median,2)[2]`, and `r round(overall.nsk.avgs.st$m.m.median,2)[3]`, respectively). However, these averages obscure significant cross-group variation, which is apparent in Figure \@ref(fig:nskplots). In particular, the Yélî Dnye recordings show much higher rates of other talker presence compared to the other language groups, with averages of `r round(filter(by.corp.nsk.avgs.st, group_corp == "Yeli_Dnye" & SpkrType == "Woman")$m.mean,2)` women, `r round(filter(by.corp.nsk.avgs.st, group_corp == "Yeli_Dnye" & SpkrType == "Man")$m.mean,2)` men, and `r round(filter(by.corp.nsk.avgs.st, group_corp == "Yeli_Dnye" & SpkrType == "Child")$m.mean,2)` children. 
Compare to North American English with averages of `r round(filter(by.corp.nsk.avgs.st, group_corp == "NA_English" & SpkrType == "Woman")$m.mean,2)` women, `r round(filter(by.corp.nsk.avgs.st, group_corp == "NA_English" & SpkrType == "Man")$m.mean,2)` men, and `r round(filter(by.corp.nsk.avgs.st, group_corp == "NA_English" & SpkrType == "Child")$m.mean,2)` children and UK English, with `r round(filter(by.corp.nsk.avgs.st, group_corp == "UK_English" & SpkrType == "Woman")$m.mean,2)` women, `r round(filter(by.corp.nsk.avgs.st, group_corp == "UK_English" & SpkrType == "Man")$m.mean,2)` men, and `r round(filter(by.corp.nsk.avgs.st, group_corp == "UK_English" & SpkrType == "Child")$m.mean,2)` children. The Tseltal and Argentinian Spanish communities fall somewhere between the Yélî Dnye and English-speaking groups, with the Tseltal group showing averages of `r round(filter(by.corp.nsk.avgs.st, group_corp == "Tseltal" & SpkrType == "Woman")$m.mean,2)` women, `r round(filter(by.corp.nsk.avgs.st, group_corp == "Tseltal" & SpkrType == "Man")$m.mean,2)` men, and `r round(filter(by.corp.nsk.avgs.st, group_corp == "Tseltal" & SpkrType == "Child")$m.mean,2)` children and the Argentinian Spanish group averages of `r round(filter(by.corp.nsk.avgs.st, group_corp == "Arg_Spanish" & SpkrType == "Woman")$m.mean,2)` women, `r round(filter(by.corp.nsk.avgs.st, group_corp == "Arg_Spanish" & SpkrType == "Man")$m.mean,2)` men, and `r round(filter(by.corp.nsk.avgs.st, group_corp == "Arg_Spanish" & SpkrType == "Child")$m.mean,2)` children.

Overall presence of other talkers looks similar across age, though we observe a slight downward trend in the number of women contributing input and a slight uptick in children contributing input in some groups. We do not statistically analyze these data given that the measure relies on the inferred number of talkers present rather than the actual number, which would require video data or time-sampled annotations [e.g., @cristia2017child]. The current measure thereby gives insight into effects of household and routine by language group, but it is not adequate for making well-substantiated claims at present.

```{r ageplots, echo=FALSE, fig.align = "center", fig.cap="Target child age for each corpus. Each datapoint represents the mean from one recording.", out.width = '100%'}
# Number of recordings at each (rounded) target-child age for the pooled
# North American English group.
age.by.corp.allNA <- quantity.rand %>%
  filter(group_corp == "NA_English") %>%
  dplyr::select(group_corp, aclew_child_id, age_mo_round) %>%
  distinct() %>%
  group_by(group_corp, age_mo_round) %>%
  summarize(
    .groups = "drop",
    n.recs = n()) %>%
  mutate(corpus = "NA English (all)") %>%
  # namespaced like every other select() call here, so the call cannot be
  # captured by another attached package that also exports select()
  dplyr::select(-group_corp)

# Same counts per individual corpus: the North American English data are split
# into their three source corpora; every other group keeps its group code.
age.by.corp <- quantity.rand %>%
  mutate(indiv_corpus = case_when(
    group_corp == "NA_English" & corp == "BER" ~ "Northeastern US (Bergelson)",
    group_corp == "NA_English" & corp == "SOD" ~ "Central Canada (McDivitt-Winnipeg)",
    group_corp == "NA_English" & corp == "WAR" ~ "Western US (Warlaumont)",
    TRUE ~ group_corp
  )) %>%
  dplyr::select(indiv_corpus, aclew_child_id, age_mo_round) %>%
  distinct() %>%
  group_by(indiv_corpus, age_mo_round) %>%
  summarize(
    .groups = "drop",
    n.recs = n()) %>%
  rename("corpus" = indiv_corpus) %>%
  # the two tables share exactly these columns, so this appends the pooled
  # "NA English (all)" rows; the key is explicit to avoid the join message
  full_join(age.by.corp.allNA, by = c("corpus", "age_mo_round", "n.recs"))

# Display names, listed in the intended top-to-bottom order of the ridge plot.
# Codes are mapped to labels BY NAME: relabelling positionally depends on the
# locale's sort order of the raw codes and silently hides misspelled codes.
corpus.labels <- c(
  "NA English (all)" = "NA English (all)",
  "Northeastern US (Bergelson)" = "Northeastern US (Bergelson)",
  "Central Canada (McDivitt-Winnipeg)" = "Central Canada (McDivitt-Winnipeg)",
  "Western US (Warlaumont)" = "Western US (Warlaumont)",
  "UK_English" = "UK English",
  "Arg_Spanish" = "Arg Spanish",
  "Tseltal" = "Tseltal",
  "Yeli_Dnye" = "Yélî Dnye")
# fail loudly rather than plot an NA corpus if an unexpected code shows up
stopifnot(all(age.by.corp$corpus %in% names(corpus.labels)))
# levels are reversed because the y axis is drawn bottom-to-top
age.by.corp$corpus <- factor(
  age.by.corp$corpus,
  levels = rev(names(corpus.labels)),
  labels = rev(unname(corpus.labels)))

# Ridge plot of target-child age per corpus, with jittered points overlaid.
age.by.corp.fig <- ggplot(age.by.corp, aes(
  x = age_mo_round, y = corpus)) +
  ylab("Corpus") +
  xlab("Target child age (mo)") +
  geom_density_ridges(
    alpha = 0.2,
    jittered_points = TRUE,
    position = position_points_jitter(width = 0.05, height = 0.1)) +
  xlim(0, 40) +
  theme_apa()

# save the figure for later
ggexport(age.by.corp.fig,
         filename = "plots/age_by_corpus_summary.png",
         width = 2500, height = 1875, res = 300)

# show as saved
knitr::include_graphics("plots/age_by_corpus_summary.png")

# age-by-corpus diffs
# Reference level first (NA English), so the model's group coefficients are
# differences from North American English.
age.mdl.grp.order <- c(
  "NA_English", "UK_English", "Arg_Spanish", "Tseltal", "Yeli_Dnye")

# One row per distinct group/corpus/child/age combination.
age.by.corp.nonsummed <- quantity.rand %>%
  mutate(corpus = case_when(
    group_corp == "NA_English" & corp == "BER" ~ "Northeastern US (Bergelson)",
    group_corp == "NA_English" & corp == "SOD" ~ "Central Canada (McDivitt-Winnipeg)",
    group_corp == "NA_English" & corp == "WAR" ~ "Western US (Warlaumont)",
    TRUE ~ group_corp
  )) %>%
  dplyr::select(group_corp, corpus, aclew_child_id, age_mo_round) %>%
  distinct() %>%
  mutate(group_corp = factor(group_corp, levels = age.mdl.grp.order))

# one rec per child, so no random effect of aclew id
age.by.corp.mdl <- lm(
  age_mo_round ~ group_corp,
  data = age.by.corp.nonsummed)
# UK Eng, Arg Sp, and Tseltal kids are significantly older than NA Eng
```

\newpage

# (Non-/)effects of target child age by corpus

The distribution of child age across corpora (Figure \@ref(fig:ageplots)) varied somewhat due to the fact that each corpus was collected at a different time and for different purposes by the contributing researchers, long before the present study was initiated. If we use simple linear regression to analyze the sampled 69 recordings to test whether age differs across corpora, we do find that children are significantly younger in the North American English language group compared to all the other corpora except Yélî Dnye (age.in.months ~ lg.grp; _p_ < .05 for UK English, Argentinian Spanish, and Tseltal and _p_ = `r round(coef(summary(age.by.corp.mdl))["group_corpYeli_Dnye",][4],3)` for Yélî Dnye).

In principle, age differences between language groups, even if present, are not necessarily a problem for the present study---in lay terms, each mixed-effects regression accounts for all modeled dimensions of each datapoint (including age, corpus, number of talkers in the clip, etc. in addition to a random effect of child) when estimating the direction and significance of impact that each predictor has on the dependent variable. That said, the relatively small sample size here combined with variance in age distribution between language groups could mean that there are true age effects present in the data that we cannot detect under present circumstances (i.e., supposing increased data would substantially change the linear fits of age in the model). As a reminder, we found no evidence for an overall effect of target child age (neither a decrease nor an increase) in the primary models of child-directed speech (i.e., TCDS in the main text and CDS here in the Supplementary Materials). We do find a significant decrease in ADS associated with child age. In our view, the primary concern is then whether we are missing overall effects of child age on TCDS and CDS. However, considering the much better age coverage of our four other language groups---which do not support overall age effects---and that the findings are in line with prior work on North American English showing no increase in CDS with age [@bergelsoncasillas2019what], we are satisfied with the current dataset and analysis.

We here visualize the effect of child age on the dependent variables of interest in each corpus (Figure \@ref(fig:xds-age-plots)) so that the interested reader can glean some informal and qualitative impression of potential differences that might be detected if more data were to be added in future work. Please note that any apparent visual differences here, as in the main text figures and tables, do not have the benefit of random-effects controls that are applied in our analysis via the use of statistical models.

```{r xds-age-plots, echo=FALSE, fig.align = "center", fig.cap="TCDS (above) and ADS (below) min/hr rates across language groups and talker types across target child age. Each datapoint represents the mean from one recording. This figure is similar to Figure 2 in the main text, but now additionally displays the data by child age. As is apparent, age effects are minimal for TCDS across language groups and talker type whereas there is a general decrease in ADS across language group and talker type.", out.width = '100%'}
# One row per recording x talker type: mean TCDS and ADS rates (min/hr), with
# talker types renamed to their plural display form.
by.rec.ests.xds.age <- quantity.rand.st %>%
  group_by(group_corpNE, aclew_child_id, age_mo_round, SpkrType) %>%
  summarize(
    .groups = "drop",
    mean.tds_mph = mean(tds_mph.st),
    mean.ads_mph = mean(ads_mph.st)) %>%
  mutate(
    SpkrType = case_when(
      SpkrType == "Woman" ~ "Women",
      SpkrType == "Man" ~ "Men",
      SpkrType == "Child" ~ "Children"
    )
  )

# Map language-group codes to display names BY NAME. Relabelling with only
# `labels=` pairs the labels with the existing level order of group_corpNE,
# and another chunk in this file labels the same variable in a different
# positional order -- so one of the two was mislabelling groups. An explicit
# code -> label lookup gives the right names whatever the level order is.
# NOTE(review): assumes group_corpNE uses the same codes as group_corp
# ("NA_English", "UK_English", ...) -- the check below fails loudly otherwise.
sample.labels <- c(
  NA_English = "NA English", UK_English = "UK English",
  Arg_Spanish = "Arg. Spanish", Tseltal = "Tseltal",
  Yeli_Dnye = "Yélî Dnye")
xds.grp.codes <- as.character(by.rec.ests.xds.age$group_corpNE)
stopifnot(all(is.na(xds.grp.codes) | xds.grp.codes %in% names(sample.labels)))
# level order (and hence legend/color order) is the display order listed above
by.rec.ests.xds.age$Sample <- factor(
  xds.grp.codes,
  levels = names(sample.labels),
  labels = unname(sample.labels))
# facet order: women, then men, then children
by.rec.ests.xds.age$SpkrType <- factor(
  by.rec.ests.xds.age$SpkrType, levels = c(
      "Women", "Men", "Children"))
# spacing (min/hr) between y-axis breaks; axis maxima are rounded up to it
xds.y.unit <- 5

# Invisible helper data that stretches each facet's y axis: for every age x
# talker type, the largest recording mean rounded up to the next multiple of
# xds.y.unit, plus one extra unit of headroom. Each row is then repeated for
# every Sample seen with that talker type so the color/fill aesthetics exist.
by.spkrtype.ylim.max.ads.age <- by.rec.ests.xds.age %>%
  group_by(age_mo_round, SpkrType) %>%
  summarize(
    .groups = "drop",
    mean.ads_mph = ceiling(
      max(mean.ads_mph)/xds.y.unit)*xds.y.unit + xds.y.unit) %>%
  # SpkrType is the only shared column; naming it makes the key explicit and
  # avoids the natural-join message in the knitted output
  left_join(
    distinct(dplyr::select(by.rec.ests.xds.age, c("Sample", "SpkrType"))),
    by = "SpkrType")

# Mean ADS min/hr by child age, one facet per talker type (strip labels are
# hidden; the legend is supplied when the panels are arranged together).
ads.x.lg.age <- ggplot(by.rec.ests.xds.age, aes(
  x = age_mo_round, y = mean.ads_mph,
  color = Sample, fill = Sample)) +
  # draws nothing; only extends the per-facet axis range
  geom_blank(data = by.spkrtype.ylim.max.ads.age, aes(
  x = age_mo_round, y = mean.ads_mph,
  color = Sample, fill = Sample)) +
  facet_wrap(~ SpkrType, scales = "free_y") +
  expand_limits(y = 0) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq(0, max(
      by.spkrtype.ylim.max.ads.age$mean.ads_mph), xds.y.unit)) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = seq(0, 36, 6)) +
  ylab("Mean ADS min/hr") +
  xlab("Age (mo)") +
  geom_smooth(method = "lm", alpha = 0.15) +
  geom_jitter() +
  theme_apa() +
  theme(
    legend.position = "none",
    strip.text.x = element_blank()
  )

# Invisible helper data that stretches each facet's y axis: for every talker
# type x age, the largest recording mean rounded up to the next multiple of
# xds.y.unit. Each row is then repeated for every Sample seen with that talker
# type so the color/fill aesthetics exist.
by.spkrtype.ylim.max.tds.age <- by.rec.ests.xds.age %>%
  group_by(SpkrType, age_mo_round) %>%
  summarize(
    .groups = "drop",
    mean.tds_mph = ceiling(
      max(mean.tds_mph)/xds.y.unit)*xds.y.unit
  ) %>%
  # Join against the DISTINCT Sample x SpkrType pairs, as the ADS panel does.
  # Without distinct() every helper row was duplicated once per recording of
  # that talker type: same axis limits, but a needlessly multiplied table.
  left_join(
    distinct(dplyr::select(by.rec.ests.xds.age, c("Sample", "SpkrType"))),
    by = "SpkrType")

# Mean TCDS min/hr by child age, one facet per talker type (no x-axis title;
# the legend is supplied when the panels are arranged together).
tds.x.lg.age <- ggplot(by.rec.ests.xds.age, aes(
  x = age_mo_round, y = mean.tds_mph,
  color = Sample, fill = Sample)) +
  # draws nothing; only extends the per-facet axis range
  geom_blank(data = by.spkrtype.ylim.max.tds.age, aes(
  x = age_mo_round, y = mean.tds_mph,
  color = Sample, fill = Sample)) +
  facet_wrap(~ SpkrType, scales = "free_y") +
  expand_limits(y = 0) +
  scale_y_continuous(
    expand = c(0, 0),
    breaks = seq(0, max(
      by.spkrtype.ylim.max.tds.age$mean.tds_mph), xds.y.unit)) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = seq(0, 36, 6)) +
  ylab("Mean TCDS min/hr") +
  geom_smooth(method = "lm", alpha = 0.15) +
  geom_jitter() +
  theme_apa() +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none"
  )

# save the figure for later
# TCDS panel on top, ADS panel below, sharing one legend at the bottom.
xds.age.panels <- ggarrange(
  tds.x.lg.age, ads.x.lg.age,
  ncol = 1, nrow = 2,
  common.legend = TRUE, legend = "bottom")
ggexport(
  xds.age.panels,
  filename = "plots/XDS_age_summary.png",
  width = 2500, height = 1875,
  res = 300)

# show as saved
knitr::include_graphics("plots/XDS_age_summary.png")
```


\newpage

# Simple models of age and cross-group difference in TCDS, CDS, and ADS rates

Up until now we have analyzed cross-corpus and age-based differences in TCDS, CDS, and ADS rate while _also_ accounting for other factors that may drive variation in input rate. These factors include: the number of talkers known to be present in a given clip and the different talker types who produce this talk (male and female adults and non-target children). There are arguments for and against including these factors in our model of cross-cultural differences, depending on one's theoretical goals.

By including these factors in the model, as we have in the main-text models of TCDS and ADS and in the model of CDS above, we can gain a more detailed perspective on the shared features that drive variation in input rate between and within language groups. For example, by adding in the number of talkers in a clip to our model, we can account for the fact that the presence of more people generally leads to more talk, regardless of the child's developmental context---indeed, we find that this effect drives variation in general and thereby affects children regardless of whether they grow up in North America, the UK, Argentina, Chiapas, or Rossel Island. A similar case can be made for talker types: the fact that women are more likely to produce all three types of input than men or children illustrates a general finding that cross-cuts the language groups, though our main models show that this effect of talker type is slightly different from context to context. By modeling these effects, we can make fairly detailed predictions about the input a child is likely to hear in a given clip (e.g., we can predict how much ADS a Tseltal-acquiring child at age 12 months with 4 other voices present will hear, and how likely it is to come from a woman versus a child versus a man).

However, a valid alternative perspective is that these cross-corpus differences in the type and number of talkers are reflective of the children's broader cultural and linguistic milieu and therefore variance due to these factors should not be separately accounted for in the model if the end goal is to obtain a general picture of the differences in children's language experiences across these communities. Consider number of talkers present in the clip.

As shown in Supplementary Materials Section 2 above, there is systematic variation across our language groups in the number of present talkers: for example, Yélî Dnye-acquiring children are surrounded by substantially more talkers than children in the other groups. There may be two ways of looking at Yélî children's experience of TCDS: (1) All else being equal, Yélî Dnye-acquiring children hear significantly less TCDS compared to North American English, _but_ the situation is not equal; because they have so many more people present than the North American English case, their overall TCDS input experienced is the same as (if not more than) what is heard by North American English-acquiring children or (2) considering children's overall linguistic environment, Yélî Dnye-acquiring children hear approximately the same rates of TCDS as North American English-acquiring children, if not more.

As the reader can tell, the first interpretation provides greater nuance, but more importantly, it speaks to useful avenues forward in understanding consistent and observable levers of cross-cultural difference (e.g., number of talkers present as a proxy for household size and composition or everyday routines; types of talker input as a proxy for alloparenting practices and (non-)overlap in work versus home settings). It can, however, obscure overall differences that are apparent when all these cultural effects add up in an individual child's experience.

In this analysis, therefore, we replicate our models of TCDS, CDS, and ADS, only now removing predictors relating to number of talkers present and type of talker. Therefore each count model only includes effects of child age (in months; centered and standardized) and language group (North American English/UK English/Argentinian Spanish/Tseltal/Yélî Dnye), and the zero-inflation component includes the same two predictors, with a random effect of child (formula = XDS.min.p.hr ~ child.age + lg.grp + (1 | child.id), ziformula = ~ child.age + lg.grp).

```{r simplermodels_cross_corp_analysis, message=FALSE, warning=FALSE, include=FALSE, paged.print=FALSE}
# Overall differences between corpora
# Simple zero-inflated negative binomial ("nbinom1") model of rounded TCDS
# min/hr: child age and language group only, with a random intercept per child.
tds.rand.st.zinb.mini <- glmmTMB(
  round(tds_mph, 0) ~
    tchiyr.std +   # no change with age (or slight increase)
    group_corpNE + # more TCDS in WEIRD
    (1 | aclew_child_id),
  # The probability of producing a structural zero
  ziformula = ~ tchiyr.std + group_corpNE,
  family = "nbinom1",
  data = quantity.rand)
# summary(tds.rand.st.zinb.mini)
# tds.rand.st.zinb.mini.res = simulateResiduals(tds.rand.st.zinb.mini)
# plot(tds.rand.st.zinb.mini.res, rank = T)
tds.rand.st.zinb.mini.disp <- round(sigma(tds.rand.st.zinb.mini), 2)

# Coefficient tables, computed once: element 1 is the count component,
# element 2 the zero-inflation component.
tds.mini.coef.tabs <- coef(summary(tds.rand.st.zinb.mini))
tds.mini.count.tab <- tds.mini.coef.tabs[[1]]
tds.mini.zi.tab <- tds.mini.coef.tabs[[2]]

# count component: one coefficient row per predictor term
tds.rand.st.zinb.mini.COEF.age <- tds.mini.count.tab["tchiyr.std", ]
tds.rand.st.zinb.mini.COEF.spanish <-
  tds.mini.count.tab["group_corpNEArg_Spanish", ]
tds.rand.st.zinb.mini.COEF.tseltal <-
  tds.mini.count.tab["group_corpNETseltal", ]
tds.rand.st.zinb.mini.COEF.ukenglish <-
  tds.mini.count.tab["group_corpNEUK_English", ]
tds.rand.st.zinb.mini.COEF.yelidnye <-
  tds.mini.count.tab["group_corpNEYeli_Dnye", ]
# zi
tds.rand.st.zinb.mini.COEF.age.ZI <- tds.mini.zi.tab["tchiyr.std", ]
tds.rand.st.zinb.mini.COEF.spanish.ZI <-
  tds.mini.zi.tab["group_corpNEArg_Spanish", ]
tds.rand.st.zinb.mini.COEF.tseltal.ZI <-
  tds.mini.zi.tab["group_corpNETseltal", ]
tds.rand.st.zinb.mini.COEF.ukenglish.ZI <-
  tds.mini.zi.tab["group_corpNEUK_English", ]
tds.rand.st.zinb.mini.COEF.yelidnye.ZI <-
  tds.mini.zi.tab["group_corpNEYeli_Dnye", ]


# Simple zero-inflated negative binomial ("nbinom1") model of rounded CDS
# min/hr: child age and language group only, with a random intercept per child.
cds.rand.st.zinb.mini <- glmmTMB(
  round(cds_mph, 0) ~
    tchiyr.std +   # no change with age (or slight increase)
    group_corpNE + # more TCDS in WEIRD
    (1 | aclew_child_id),
  # The probability of producing a structural zero
  ziformula = ~ tchiyr.std + group_corpNE,
  family = "nbinom1",
  data = quantity.rand)
# summary(cds.rand.st.zinb.mini)
# cds.rand.st.zinb.mini.res = simulateResiduals(cds.rand.st.zinb.mini)
# plot(cds.rand.st.zinb.mini.res, rank = T)
cds.rand.st.zinb.mini.disp <- round(sigma(cds.rand.st.zinb.mini), 2)

# Coefficient tables, computed once: element 1 is the count component,
# element 2 the zero-inflation component.
cds.mini.coef.tabs <- coef(summary(cds.rand.st.zinb.mini))
cds.mini.count.tab <- cds.mini.coef.tabs[[1]]
cds.mini.zi.tab <- cds.mini.coef.tabs[[2]]

# count component: one coefficient row per predictor term
cds.rand.st.zinb.mini.COEF.age <- cds.mini.count.tab["tchiyr.std", ]
cds.rand.st.zinb.mini.COEF.spanish <-
  cds.mini.count.tab["group_corpNEArg_Spanish", ]
cds.rand.st.zinb.mini.COEF.tseltal <-
  cds.mini.count.tab["group_corpNETseltal", ]
cds.rand.st.zinb.mini.COEF.ukenglish <-
  cds.mini.count.tab["group_corpNEUK_English", ]
cds.rand.st.zinb.mini.COEF.yelidnye <-
  cds.mini.count.tab["group_corpNEYeli_Dnye", ]
# zi
cds.rand.st.zinb.mini.COEF.age.ZI <- cds.mini.zi.tab["tchiyr.std", ]
cds.rand.st.zinb.mini.COEF.spanish.ZI <-
  cds.mini.zi.tab["group_corpNEArg_Spanish", ]
cds.rand.st.zinb.mini.COEF.tseltal.ZI <-
  cds.mini.zi.tab["group_corpNETseltal", ]
cds.rand.st.zinb.mini.COEF.ukenglish.ZI <-
  cds.mini.zi.tab["group_corpNEUK_English", ]
cds.rand.st.zinb.mini.COEF.yelidnye.ZI <-
  cds.mini.zi.tab["group_corpNEYeli_Dnye", ]


# Simple zero-inflated negative binomial ("nbinom1") model of rounded ADS
# min/hr: child age and language group only, with a random intercept per child.
ads.rand.st.zinb.mini <- glmmTMB(
  round(ads_mph, 0) ~
    tchiyr.std +   # decrease with age
    group_corpNE + # more ADS in non-WEIRD
    (1 | aclew_child_id),
  # The probability of producing a structural zero
  ziformula = ~ tchiyr.std + group_corpNE,
  family = "nbinom1",
  data = quantity.rand)
# summary(ads.rand.st.zinb.mini)
# ads.rand.st.zinb.mini.res = simulateResiduals(ads.rand.st.zinb.mini)
# plot(ads.rand.st.zinb.mini.res, rank = T)
ads.rand.st.zinb.mini.disp <- round(sigma(ads.rand.st.zinb.mini), 2)

# Coefficient tables, computed once: element 1 is the count component,
# element 2 the zero-inflation component.
ads.mini.coef.tabs <- coef(summary(ads.rand.st.zinb.mini))
ads.mini.count.tab <- ads.mini.coef.tabs[[1]]
ads.mini.zi.tab <- ads.mini.coef.tabs[[2]]

# count component: one coefficient row per predictor term
ads.rand.st.zinb.mini.COEF.age <- ads.mini.count.tab["tchiyr.std", ]
ads.rand.st.zinb.mini.COEF.spanish <-
  ads.mini.count.tab["group_corpNEArg_Spanish", ]
ads.rand.st.zinb.mini.COEF.tseltal <-
  ads.mini.count.tab["group_corpNETseltal", ]
ads.rand.st.zinb.mini.COEF.ukenglish <-
  ads.mini.count.tab["group_corpNEUK_English", ]
ads.rand.st.zinb.mini.COEF.yelidnye <-
  ads.mini.count.tab["group_corpNEYeli_Dnye", ]
# zi
ads.rand.st.zinb.mini.COEF.age.ZI <- ads.mini.zi.tab["tchiyr.std", ]
ads.rand.st.zinb.mini.COEF.spanish.ZI <-
  ads.mini.zi.tab["group_corpNEArg_Spanish", ]
ads.rand.st.zinb.mini.COEF.tseltal.ZI <-
  ads.mini.zi.tab["group_corpNETseltal", ]
ads.rand.st.zinb.mini.COEF.ukenglish.ZI <-
  ads.mini.zi.tab["group_corpNEUK_English", ]
ads.rand.st.zinb.mini.COEF.yelidnye.ZI <-
  ads.mini.zi.tab["group_corpNEYeli_Dnye", ]

```

## Target-child-directed speech (TCDS)

The count model of the simpler regression of TCDS (N = `r nobs(tds.rand.st.zinb.mini)`, log-likelihood = `r round(logLik(tds.rand.st.zinb.mini)[1], 2)`, overdispersion estimate = `r tds.rand.st.zinb.mini.disp`) showed no effects of child age or language group (all _p_'s > .09). The zero-inflation component similarly showed no evidence for significant effects of age or language group (all _p_'s ≥ .15).

## All child-directed speech (CDS)

The count model of the simpler regression of CDS (N = `r nobs(cds.rand.st.zinb.mini)`, log-likelihood = `r round(logLik(cds.rand.st.zinb.mini)[1], 2)`, overdispersion estimate = `r cds.rand.st.zinb.mini.disp`) showed no effect of child age (_B_ = `r round(cds.rand.st.zinb.mini.COEF.age[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.mini.COEF.age[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.mini.COEF.age[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.mini.COEF.age[[4]])`) but a significant effect of language group: CDS rates were significantly higher for Yélî Dnye-acquiring children compared to North American English-acquiring children (_B_ = `r round(cds.rand.st.zinb.mini.COEF.yelidnye[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.mini.COEF.yelidnye[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.mini.COEF.yelidnye[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.mini.COEF.yelidnye[[4]])`). No other language group showed significant difference in the rate of CDS compared to North American English (all _p_'s ≥ 0.2). However, in this case the zero-inflation component showed that both Tseltal and Yélî Dnye were significantly less likely than North American English to have clips with zero CDS (Tseltal: _B_ = `r round(cds.rand.st.zinb.mini.COEF.tseltal.ZI[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.mini.COEF.tseltal.ZI[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.mini.COEF.tseltal.ZI[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.mini.COEF.tseltal.ZI[[4]])`; Yélî Dnye: _B_ = `r round(cds.rand.st.zinb.mini.COEF.yelidnye.ZI[[1]],2)`, _SE_ = `r round(cds.rand.st.zinb.mini.COEF.yelidnye.ZI[[2]],2)`, _z_ = `r round(cds.rand.st.zinb.mini.COEF.yelidnye.ZI[[3]],2)`, _p_ `r pval.display(cds.rand.st.zinb.mini.COEF.yelidnye.ZI[[4]])`; UK English _p_ = .49; Arg. Spanish _p_ = .99). 
Put differently, the combined outcomes of the model components show that zero-CDS clips were significantly more likely for North American English-acquiring children than Tseltal and Yélî Dnye-acquiring children and that, for clips with some non-zero amount of CDS, the rate of CDS is significantly higher for Yélî Dnye-acquiring children than North American English-acquiring children. The zero-inflation component gave no evidence for an effect of child age (_p_ `r pval.display(cds.rand.st.zinb.mini.COEF.age.ZI[[4]])`).

## Adult-directed speech (ADS)

The count model of the simpler regression of ADS (N = `r nobs(ads.rand.st.zinb.mini)`, log-likelihood = `r round(logLik(ads.rand.st.zinb.mini)[1], 2)`, overdispersion estimate = `r ads.rand.st.zinb.mini.disp`) showed significant effects of both child age and language group. ADS decreased with child age (_B_ = `r round(ads.rand.st.zinb.mini.COEF.age[[1]],2)`, _SE_ = `r round(ads.rand.st.zinb.mini.COEF.age[[2]],2)`, _z_ = `r round(ads.rand.st.zinb.mini.COEF.age[[3]],2)`, _p_ `r pval.display(ads.rand.st.zinb.mini.COEF.age[[4]])`). ADS rates were also significantly higher in Yélî Dnye compared to North American English (_B_ = `r round(ads.rand.st.zinb.mini.COEF.yelidnye[[1]],2)`, _SE_ = `r round(ads.rand.st.zinb.mini.COEF.yelidnye[[2]],2)`, _z_ = `r round(ads.rand.st.zinb.mini.COEF.yelidnye[[3]],2)`, _p_ `r pval.display(ads.rand.st.zinb.mini.COEF.yelidnye[[4]])`). No other language group showed significant difference in the rate of ADS compared to North American English (all _p_ > 0.19). As with CDS, the zero-inflation model component revealed further structure in the data: zero-ADS clips were significantly less likely in Yélî Dnye data compared to North American English (Yélî Dnye: _B_ = `r round(ads.rand.st.zinb.mini.COEF.yelidnye.ZI[[1]],2)`, _SE_ = `r round(ads.rand.st.zinb.mini.COEF.yelidnye.ZI[[2]],2)`, _z_ = `r round(ads.rand.st.zinb.mini.COEF.yelidnye.ZI[[3]],2)`, _p_ `r pval.display(ads.rand.st.zinb.mini.COEF.yelidnye.ZI[[4]])`; Tseltal: _B_ = `r round(ads.rand.st.zinb.mini.COEF.tseltal.ZI[[1]],2)`, _SE_ = `r round(ads.rand.st.zinb.mini.COEF.tseltal.ZI[[2]],2)`, _z_ = `r round(ads.rand.st.zinb.mini.COEF.tseltal.ZI[[3]],2)`, _p_ `r pval.display(ads.rand.st.zinb.mini.COEF.tseltal.ZI[[4]])`; UK English _p_ = .31; Arg. Spanish _p_ = .20). 
Again then, the combined output of the model shows that zero-ADS clips were significantly more likely for North American English-acquiring children than Yélî Dnye-acquiring children and that, for clips with non-zero amounts of ADS, the rate of ADS is significantly higher for Yélî Dnye-acquiring children than North American English-acquiring children. Consistent with the other models, there was no evidence of an age effect in the zero-inflation model component (_p_ `r pval.display(ads.rand.st.zinb.mini.COEF.age.ZI[[4]])`).

Pulling these results together with those reported in the main text (TCDS, ADS) and above (CDS), two primary points are worth highlighting. First, Yélî Dnye looks very different when the number of talkers is removed from the model---it has equivalent overall rates of TCDS, higher rates of CDS and ADS, and is less likely to have a zero-CDS or zero-ADS clip compared to North American English. This pattern falls in line with the main-text results and the fact that there are simply more people present in the language environment of Yélî Dnye-acquiring children compared to the other language groups included here. Second, in this simplified analysis approach we lose sight of the critical and cross-group effects that account for fluctuations in talker presence and types of talkers present that we know, from the primary analyses, have a significant impact on the data.

\newpage

# Individual North American English data

In the main-text analyses we pool together the North American English datasets. We here present the primary descriptive figure from the main text, but with data broken out by individual corpus and not by language group.

```{r xds-byfinecorp-plots, echo=FALSE, fig.align = "center", fig.cap="TCDS (above) and ADS (below) min/hr rates across individual North American English corpora and talker types across target child age. Each datapoint represents the mean from one recording.", out.width = '100%'}
# Spacing between y-axis breaks, also used as the rounding unit for the
# per-facet upper limits computed for the panels below.
xds.y.unit <- 5

# One row per corpus x child x age x talker type, holding the mean TCDS and
# ADS rates, restricted to rows whose group_corp is "NA_English".
by.rec.ests.xds.corp <- quantity.rand.st %>%
  filter(group_corp == "NA_English") %>%
  group_by(corp, aclew_child_id, age_mo_round, SpkrType) %>%
  summarize(
    mean.tds_mph = mean(tds_mph.st),
    mean.ads_mph = mean(ads_mph.st),
    .groups = "drop") %>%
  mutate(
    # Pluralize the talker-type labels and fix their facet order; any value
    # other than Woman/Man/Child becomes NA.
    SpkrType = factor(
      case_when(
        SpkrType == "Woman" ~ "Women",
        SpkrType == "Man" ~ "Men",
        SpkrType == "Child" ~ "Children"),
      levels = c("Women", "Men", "Children")),
    # Human-readable corpus names for the legend.
    # NOTE(review): labels are matched positionally to the sorted unique
    # values of `corp`; this assumes exactly three corpora whose sort order
    # lines up with the labels below -- confirm against the data.
    Sample = factor(corp, labels = c(
      "Northeastern US (Bergelson)",
      "Central Canada (McDivitt-Winnipeg)",
      "Western US (Warlaumont)")))

# ---- ADS panel (bottom row of the figure) ----

# Invisible anchor points used to set tidy y-axis limits: for each talker
# type x age cell, take the largest per-recording ADS mean, round it up to a
# multiple of xds.y.unit, and add one further unit of headroom.
by.spkrtype.ylim.max.ads.corp <- by.rec.ests.xds.corp %>%
  group_by(SpkrType, age_mo_round) %>%
  summarize(
    .groups = "drop",
    mean.ads_mph = ceiling(max(mean.ads_mph)/xds.y.unit)*xds.y.unit + xds.y.unit) %>%
  # Replicate each anchor once per Sample so the color/fill aesthetics of
  # geom_blank() resolve. The join key is spelled out so the join does not
  # print a "Joining, by" message into the knitted document.
  left_join(
    distinct(dplyr::select(by.rec.ests.xds.corp, c("Sample", "SpkrType"))),
    by = "SpkrType")

ads.x.lg.corp <- ggplot(by.rec.ests.xds.corp, aes(
  x = age_mo_round, y = mean.ads_mph,
  color = Sample, fill = Sample)) +
  # Draws nothing; only stretches each facet's y range to the anchors above.
  geom_blank(data = by.spkrtype.ylim.max.ads.corp, aes(
  x = age_mo_round, y = mean.ads_mph,
  color = Sample, fill = Sample)) +
  facet_wrap(~ SpkrType, scales = "free_y") +
  expand_limits(y = 0) +
  scale_y_continuous(
    expand = c(0, 0),
    # Breaks are derived from this panel's own ADS limits. They were
    # previously taken from a TCDS limits object built for a different
    # figure, which could leave the top of the ADS axis without tick marks.
    breaks = seq(0, max(
      by.spkrtype.ylim.max.ads.corp$mean.ads_mph), xds.y.unit)) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = seq(0, 36, 6)) +
  ylab("Mean ADS min/hr") +
  xlab("Age (mo)") +
  geom_smooth(method = "lm", alpha = 0.15) +
  geom_jitter() +
  theme_apa() +
  theme(
    # The legend is added once for both panels when they are arranged; the
    # facet strips are blanked on this panel only.
    legend.position = "none",
    strip.text.x = element_blank()
  )

# ---- TCDS panel (top row of the figure) ----

# Invisible anchor points used to set tidy y-axis limits: for each talker
# type x age cell, take the largest per-recording TCDS mean and round it up
# to a multiple of xds.y.unit.
by.spkrtype.ylim.max.tds.corp <- by.rec.ests.xds.corp %>%
  group_by(SpkrType, age_mo_round) %>%
  summarize(
    .groups = "drop",
    mean.tds_mph = ceiling(max(mean.tds_mph)/xds.y.unit)*xds.y.unit
  ) %>%
  # Replicate each anchor once per Sample so the color/fill aesthetics of
  # geom_blank() resolve. distinct() keeps one row per Sample x SpkrType
  # (matching the ADS panel) instead of one per recording, and the join key
  # is spelled out so no "Joining, by" message reaches the knitted document.
  left_join(
    distinct(dplyr::select(by.rec.ests.xds.corp, c("Sample", "SpkrType"))),
    by = "SpkrType")

tds.x.lg.corp <- ggplot(by.rec.ests.xds.corp, aes(
  x = age_mo_round, y = mean.tds_mph,
  color = Sample, fill = Sample)) +
  # Draws nothing; only stretches each facet's y range to the anchors above.
  geom_blank(data = by.spkrtype.ylim.max.tds.corp, aes(
  x = age_mo_round, y = mean.tds_mph,
  color = Sample, fill = Sample)) +
  facet_wrap(~ SpkrType, scales = "free_y") +
  expand_limits(y = 0) +
  scale_y_continuous(
    expand = c(0, 0),
    # Breaks are derived from this panel's own per-corpus TCDS limits rather
    # than from a limits object built for a different figure.
    breaks = seq(0, max(
      by.spkrtype.ylim.max.tds.corp$mean.tds_mph), xds.y.unit)) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = seq(0, 36, 6)) +
  ylab("Mean TCDS min/hr") +
  geom_smooth(method = "lm", alpha = 0.15) +
  geom_jitter() +
  theme_apa() +
  theme(
    # The x-axis title is blanked on this panel; the legend is added once
    # for both panels when they are arranged.
    axis.title.x = element_blank(),
    legend.position = "none"
  )

# Stack the TCDS panel above the ADS panel with a single shared legend at
# the bottom, and write the combined figure to disk.
xds.indivNA.fig.file <- "plots/XDS_indivNAcorpus_summary.png"
xds.indivNA.fig <- ggarrange(
  tds.x.lg.corp, ads.x.lg.corp,
  ncol = 1, nrow = 2,
  common.legend = TRUE, legend = "bottom")
ggexport(
  xds.indivNA.fig,
  filename = xds.indivNA.fig.file,
  width = 2500, height = 1875,
  res = 300)

# Embed the saved file, so the document shows the figure exactly as exported.
knitr::include_graphics(xds.indivNA.fig.file)
```


\newpage

# Full model output for TCDS, CDS, and ADS

The full zero-inflated negative binomial mixed-effects regression output tables for TCDS, CDS, and ADS rate are presented below. Along with the output tables of TCDS and ADS we show the alternative models with other reference levels for language group.