From 8a1aadfc88fb20fc26ba37b266c54ccb1ff7cfcb Mon Sep 17 00:00:00 2001 From: Mason Garrison <6001608+smasongarrison@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:44:31 -0500 Subject: [PATCH 01/27] cran prep --- cran-comments.md | 8 +++++--- vignettes/v1_modelingvariancecomponents.html | 10 +++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cran-comments.md b/cran-comments.md index fbd7861c..ea62ca7d 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -18,7 +18,7 @@ This update includes minor enhancements and bug fixes related to how string ids ## R CMD check results ── R CMD check results ───────────────────────────────────────────────────────────────────────── BGmisc 1.5.2 ──── -Duration: 1m 26.1s +Duration: 1m 35.9s 0 errors ✔ | 0 warnings ✔ | 0 notes ✔ @@ -26,10 +26,12 @@ R CMD check succeeded ## revdepcheck results -We checked 2 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. +We checked 2 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. * We saw 0 new problems * We failed to check 0 packages + +The development version of ggpedigree resolves "E: 1" seen in the CRAN version. I maintain both packages, so once the latest version of BGmisc is on CRAN, I will submit the updated ggpedigree version. > revdepcheck::revdep_check(num_workers = 4) ── INSTALL ────────────────────────────────────────────────────────── 2 versions ── @@ -39,4 +41,4 @@ We checked 2 reverse dependencies, comparing R CMD check results across CRAN and OK: 2 BROKEN: 0 -Total time: 4 min +Total time: 3 min diff --git a/vignettes/v1_modelingvariancecomponents.html b/vignettes/v1_modelingvariancecomponents.html index 3060a1e0..84a077f9 100644 --- a/vignettes/v1_modelingvariancecomponents.html +++ b/vignettes/v1_modelingvariancecomponents.html @@ -431,7 +431,7 @@

Using identifyComponentModel Function

#> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union -# require(purrr) + selVars <- c("ht1", "ht2") @@ -480,8 +480,8 @@

Using identifyComponentModel Function

#> AIC: -5917.148 -3685.148 -3685.078 #> BIC: -10747.543 -3667.773 -3680.471 #> To get additional fit indices, see help(mxRefModels) -#> timestamp: 2026-01-07 10:29:53 -#> Wall clock time: 0.05330706 secs +#> timestamp: 2026-01-07 10:34:22 +#> Wall clock time: 0.0555501 secs #> optimizer: SLSQP #> OpenMx version number: 2.22.10 #> Need help? See help(mxSummary) @@ -523,8 +523,8 @@

Using identifyComponentModel Function

#> AIC: -9113.092 -5499.092 -5499.048 #> BIC: -17811.437 -5479.794 -5492.498 #> To get additional fit indices, see help(mxRefModels) -#> timestamp: 2026-01-07 10:29:53 -#> Wall clock time: 0.05561519 secs +#> timestamp: 2026-01-07 10:34:22 +#> Wall clock time: 0.04742312 secs #> optimizer: SLSQP #> OpenMx version number: 2.22.10 #> Need help? See help(mxSummary) From d8ea03d6b02e48d180f7e11a70c2bd7380ac31e1 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Fri, 23 Jan 2026 10:13:26 -0500 Subject: [PATCH 02/27] Update buildComponent.R --- R/buildComponent.R | 24 +++++++++++++++------ vignettes/v1_modelingvariancecomponents.Rmd | 3 ++- vignettes/v2_pedigree.Rmd | 1 + 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/R/buildComponent.R b/R/buildComponent.R index 3c2ac464..60ca7d01 100644 --- a/R/buildComponent.R +++ b/R/buildComponent.R @@ -1,9 +1,7 @@ #' Take a pedigree and turn it into a relatedness matrix #' @param ped a pedigree dataset. Needs ID, momID, and dadID columns #' @param component character. Which component of the pedigree to return. See Details. -#' @param max_gen the maximum number of generations to compute -#' (e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. -#' `Inf` uses as many generations as there are in the data. +#' @param max_gen the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25. #' @param sparse logical. If TRUE, use and return sparse matrices from Matrix package #' @param verbose logical. If TRUE, print progress through stages of algorithm #' @param update_rate numeric. The rate at which to print progress @@ -100,6 +98,7 @@ ped2com <- function(ped, component, "tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star" ) + if (!config$transpose_method %in% transpose_method_options) { stop(paste0( "Invalid method specified. Choose from ", @@ -107,10 +106,12 @@ ped2com <- function(ped, component, )) } - - if (!config$adjacency_method %in% - c("indexed", "loop", "direct", "beta")) { - stop("Invalid method specified. Choose from 'indexed', 'loop', 'direct', or 'beta'.") + # Validate the 'adjacency_method' argument + adjacency_method_options <- c("indexed", "loop", "direct", "beta") + if (!config$adjacency_method %in% adjacency_method_options + ) { + stop(paste0("Invalid method specified. Choose from ", + paste(adjacency_method_options, collapse = ", "), ".")) } # standardize colnames @@ -215,12 +216,21 @@ ped2com <- function(ped, component, count <- 0 } maxCount <- config$max_gen + 1 + if (config$verbose == TRUE) { cat("About to do RAM path tracing\n") } # r is I + A + A^2 + ... = (I-A)^-1 from RAM # could trim, here + ## it keeps going until it explains all of the relatedness with themselves (i.e., mtSum == 0) + # some of this precision is articifuial because we literally get to the point that the condon is eaither there or not. probabiliticy + + # how much percision do we need to get unbiased estimates + + # big matrix still happens here because the network is built. just less percise on inbreeding + + # bias-precision tradeoff. how much percision do we need to get unbiased estimates? not a lot while (mtSum != 0 && count < maxCount) { r <- r + newIsPar gen <- gen + (Matrix::rowSums(newIsPar) > 0) diff --git a/vignettes/v1_modelingvariancecomponents.Rmd b/vignettes/v1_modelingvariancecomponents.Rmd index ee18b269..8a771263 100644 --- a/vignettes/v1_modelingvariancecomponents.Rmd +++ b/vignettes/v1_modelingvariancecomponents.Rmd @@ -98,6 +98,7 @@ identifyComponentModel( As you can see the model is identified, now that we've added another group. Let us confirm by fitting a model. First we prepare the data. ```{r, include=FALSE} +library(dplyr) if (!requireNamespace("OpenMx", quietly = TRUE)) { # if OpenMx isn't available n_subjects <- 500 @@ -130,7 +131,7 @@ if (!requireNamespace("OpenMx", quietly = TRUE)) { ``` ```{r} -require(dplyr) +library(dplyr) selVars <- c("ht1", "ht2") diff --git a/vignettes/v2_pedigree.Rmd b/vignettes/v2_pedigree.Rmd index 92ea4f7c..5f0f3698 100644 --- a/vignettes/v2_pedigree.Rmd +++ b/vignettes/v2_pedigree.Rmd @@ -51,6 +51,7 @@ The columns represents the individual's family ID, the individual's personal ID, ```{r} summarizeFamilies(df_ped, famID = "fam")$family_summary ``` + ## Plotting Pedigree Pedigrees are visual diagrams that represent family relationships across generations. They are commonly used in genetics to trace the inheritance of specific traits or conditions. This vignette will guide you through visualizing simulated pedigrees using the `plotPedigree` function. This function is a wrapper function for `Kinship2`'s base R plotting. The sister package ggpedigree has a much nicer plotting function. It's also available on CRAN, but it is not a dependency of BGmisc. If you want to use ggpedigree, you can install it with `install.packages("ggpedigree")` and then use `ggplot2` syntax to plot pedigrees. From 385c01fd69545d8730a5caff10efe541d7d05736 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Fri, 23 Jan 2026 10:17:25 -0500 Subject: [PATCH 03/27] Rename helpPedigree.R to helpSimulatePedigree.R Renamed the file to better reflect its purpose and improve code organization. --- R/{helpPedigree.R => helpSimulatePedigree.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{helpPedigree.R => helpSimulatePedigree.R} (100%) diff --git a/R/helpPedigree.R b/R/helpSimulatePedigree.R similarity index 100% rename from R/helpPedigree.R rename to R/helpSimulatePedigree.R From 18ff0e562163dc31a0b6200dfaf0df1ff0ff802d Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Fri, 23 Jan 2026 13:23:39 -0500 Subject: [PATCH 04/27] Optimize simulatePedigree within-generation logic Refactored simulatePedigree to support an optimized within-generation simulation via a new beta parameter. Added buildWithinGenerations_optimized and benchmarking script to compare performance. Also fixed error handling for insufficient single individuals. --- NEWS.md | 6 ++ R/simulatePedigree.R | 194 ++++++++++++++++++++++++++++++++++++++++-- data-raw/optimizing.R | 71 ++++++++++++++++ 3 files changed, 262 insertions(+), 9 deletions(-) create mode 100644 data-raw/optimizing.R diff --git a/NEWS.md b/NEWS.md index 6737cbd6..85ee2b84 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# BGmisc NEWS + +# BGmisc Dev +* Optimize Within Generation Pedigree simulatePedigree +* Fix error when not enough single people available + # BGmisc 1.5.2 * More flexible ID generation for simulatePedigree * Created ped2gen function to extract generation information from pedigree data.frames diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index 1c3948fd..226b11b1 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -6,13 +6,55 @@ #' @inheritParams simulatePedigree #' @inheritParams createGenDataFrame #' @return A data frame representing the simulated pedigree, including columns for family ID (`fam`), -buildWithinGenerations <- function(sizeGens, marR, sexR, Ngen, verbose = FALSE, - personID = "ID", - momID = "momID", - dadID = "dadID", - code_male = "M", - code_female = "F", - fam_shift = 1L) { + +buildWithinGenerations <- function(beta = FALSE, + sizeGens, marR, sexR, Ngen, verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F", + fam_shift = 1L) { + if (beta) { + df_Fam <- buildWithinGenerations_optimized( + sizeGens = sizeGens, + marR = marR, + sexR = sexR, + Ngen = Ngen, + verbose = verbose, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female, + fam_shift = fam_shift + ) + } else { + df_Fam <- buildWithinGenerations_old( + sizeGens = sizeGens, + marR = marR, + sexR = sexR, + Ngen = Ngen, + verbose = verbose, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female, + fam_shift = fam_shift + ) + } + return(df_Fam) +} + + +buildWithinGenerations_old <- function(sizeGens, marR, sexR, Ngen, verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F", + fam_shift = 1L) { idx_width <- nchar(max(sizeGens)) gen_width <- max(2L, nchar(Ngen)) fam_shift <- 1L @@ -125,6 +167,137 @@ buildWithinGenerations <- function(sizeGens, marR, sexR, Ngen, verbose = FALSE, } +buildWithinGenerations_optimized <- function(sizeGens, marR, sexR, Ngen, verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F", + fam_shift = 1L) { + idx_width <- nchar(max(sizeGens)) + gen_width <- max(2L, nchar(Ngen)) + + + # Precompute powers once + pow_idx <- 10^idx_width + pow_gen <- 10^(gen_width + idx_width) + + ## Connect male and female into couples in each generations + marR_crt <- (1 + marR) / 2 + + for (i in seq_len(Ngen)) { + # idGen <- as.numeric(paste(100, i, 1:sizeGens[i], sep = "")) + + idGen <- fam_shift * pow_gen + i * pow_idx + seq_len(sizeGens[i]) + + + ### For each generation, create a separate dataframe + df_Ngen <- createGenDataFrame( + sizeGens = sizeGens, + genIndex = i, + idGen = idGen + ) + + ### Let's deal with the sex in each generation first + + df_Ngen$sex <- determineSex( + idGen = idGen, sexR = sexR, + code_male = code_male, + code_female = code_female + ) + + # message(paste("tiger",i)) + # The first generation + if (i == 1) { + df_Ngen$spID[1] <- df_Ngen$id[2] + df_Ngen$spID[2] <- df_Ngen$id[1] + + df_Ngen$sex[1] <- code_female + df_Ngen$sex[2] <- code_male + } + + + usedFemaleIds <- numeric() + usedMaleIds <- numeric() + + # reserve the single persons + if (i != 1 && i != Ngen) { + totalFemale <- sum(df_Ngen$sex == code_female) + totalMale <- sum(df_Ngen$sex == code_male) + nMarriedFemale <- round(totalFemale * marR_crt) + nMarriedMale <- round(totalMale * marR_crt) + # make sure there are same numbers of married males and females + if (nMarriedFemale >= nMarriedMale) { + nMarriedFemale <- nMarriedMale + } else { + nMarriedMale <- nMarriedFemale + } + # get the number of single males and females + nSingleFemale <- totalFemale - nMarriedFemale + nSingleMale <- totalMale - nMarriedMale + + + # sample single ids from male ids and female ids + if (nSingleFemale < 0) { + nSingleFemale <- 0 + message("Warning: Negative number of single women available; setting to 0") + usedFemaleIds <- numeric() + } else { + usedFemaleIds <- sample(df_Ngen$id[df_Ngen$sex == code_female], nSingleFemale) + } + ## message(c("Used F", usedFemaleIds)) + if (nSingleMale < 0) { + nSingleMale <- 0 + message("Warning: Negative number of single men available; setting to 0") + usedMaleIds <- numeric() + } else { + usedMaleIds <- sample(df_Ngen$id[df_Ngen$sex == code_male], nSingleMale) + } + ## message(c("Used M", usedMaleIds)) + # usedIds <- c(usedFemaleIds, usedMaleIds) + isUsed <- df_Ngen$id %in% c(usedFemaleIds, usedMaleIds) + + # Create spouses + for (j in seq_len(nrow(df_Ngen))) { + if (isUsed[j]) { + next + } else { + + if (df_Ngen$sex[j] == code_female) { + spouse_sex_code <- code_male + } else { + spouse_sex_code <- code_female + } + for (k in seq_len(nrow(df_Ngen))) { + + tgt <- (!isUsed[k]) & df_Ngen$sex[k] == spouse_sex_code + + if (tgt) { + df_Ngen$spID[j] <- df_Ngen$id[k] + df_Ngen$spID[k] <- df_Ngen$id[j] + + isUsed[j] <- TRUE + isUsed[k] <- TRUE + break + } else { + next + } + } + + } + # message(usedIds) + } + } + if (i == 1) { + df_Fam <- df_Ngen + } else { + df_Fam <- rbind(df_Fam, df_Ngen) + } + } + return(df_Fam) +} + + #' Process Generation Connections #' #' This function processes connections between each two generations in a pedigree simulation. @@ -421,7 +594,9 @@ simulatePedigree <- function(kpc = 3, spouseID = "spouseID", code_male = "M", code_female = "F", - fam_shift = 1L) { + fam_shift = 1L, + beta = FALSE +) { # SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations # SexRatio <- sexR / (1 - sexR) @@ -444,7 +619,8 @@ simulatePedigree <- function(kpc = 3, dadID = dadID, code_male = code_male, code_female = code_female, - fam_shift = fam_shift + fam_shift = fam_shift, + beta = beta ) if (verbose == TRUE) { message( diff --git a/data-raw/optimizing.R b/data-raw/optimizing.R new file mode 100644 index 00000000..a66983ca --- /dev/null +++ b/data-raw/optimizing.R @@ -0,0 +1,71 @@ +library(profvis) +library(microbenchmark) +library(tidyverse) +set.seed(6) +Ngen <- 3 +kpc <- 3 +sexR <- .55 +marR <- .8 +reps <- 40 +if (FALSE) { + profvis({ + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) + }) + + profvis({ + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) + }) +} + + +benchmark_results <- microbenchmark( + beta_false_lowgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_lowgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) + }, + + beta_false_midgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_midgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = TRUE) + }, + + beta_false_highgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_highgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = TRUE) + }, + times = reps # Run each method 10 times +) + +benchmark_results <- benchmark_results %>% + mutate( + beta = factor(ifelse(grepl("beta_true", expr), TRUE, FALSE)), + gen = factor(case_when( + grepl("lowgen", expr) ~ Ngen, + grepl("midgen", expr) ~ Ngen * 2, + grepl("highgen", expr) ~ Ngen * 3 + )) + ) + +summary(benchmark_results) +lm(benchmark_results$time ~ benchmark_results$beta * benchmark_results$gen) %>% + summary() + + +# log transform time for better visualization + +ggplot(benchmark_results, aes(x = gen, y = time / 1e6, color = beta)) + + geom_boxplot() + + labs( + title = "Benchmarking simulatePedigree() with and without beta parameter", + x = "Generation Size", + y = "Execution Time (ms)", + color = "Beta Parameter" + ) + + theme_minimal() + + scale_y_log10() From e5988bb133af45fd083cd499291996b51a82cf1a Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Fri, 23 Jan 2026 17:31:45 -0500 Subject: [PATCH 05/27] Update simulatePedigree.R --- R/simulatePedigree.R | 253 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 251 insertions(+), 2 deletions(-) diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index 226b11b1..be380d8a 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -326,7 +326,255 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose = FALSE, mar rd_kpc, personID = "ID", momID = "momID", dadID = "dadID", - code_male = "M", code_female = "F") { + code_male = "M", code_female = "F", beta = FALSE) { + if (beta) { + df_Fam <- buildBetweenGenerations_optimized( + df_Fam = df_Fam, + Ngen = Ngen, + sizeGens = sizeGens, + verbose = verbose, + marR = marR, + sexR = sexR, + kpc = kpc, + rd_kpc = rd_kpc, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female + ) + } else { + df_Fam <- buildBetweenGenerations_old( + df_Fam = df_Fam, + Ngen = Ngen, + sizeGens = sizeGens, + verbose = verbose, + marR = marR, + sexR = sexR, + kpc = kpc, + rd_kpc = rd_kpc, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female + ) + } + return(df_Fam) +} + + +buildBetweenGenerations_optimized <- function(df_Fam, + Ngen, + sizeGens, + verbose = FALSE, + marR, sexR, kpc, + rd_kpc, personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F") { + df_Fam$ifparent <- FALSE + df_Fam$ifson <- FALSE + df_Fam$ifdau <- FALSE + for (i in seq_len(Ngen)) { + # generation 1 doesn't need any mother and father + if (i == 1) { + df_Ngen <- df_Fam[df_Fam$gen == i, ] + df_Ngen$ifparent <- TRUE + df_Ngen$ifson <- FALSE + df_Ngen$ifdau <- FALSE + df_Fam[df_Fam$gen == i, ] <- df_Ngen + } else { + # calculate the number of couples in the i-1 th generation + N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[df_Fam$gen == i - 1]))) * 0.5 + # calculate the number of members in the i th generation that have a link to the couples in the i-1 th generation + N_LinkedMem <- N_couples * kpc + # decompose the linked members into females and males respectively + N_LinkedFemale <- round(N_LinkedMem * (1 - sexR)) + N_LinkedMale <- N_LinkedMem - N_LinkedFemale + + # Create a pool for used male children and female children respectively + usedFemaleIds <- numeric() + usedMaleIds <- numeric() + usedIds <- c(usedFemaleIds, usedMaleIds) + + # get the df for the i the generation + df_Ngen <- df_Fam[df_Fam$gen == i, ] + df_Ngen$ifparent <- FALSE + df_Ngen$ifson <- FALSE + df_Ngen$ifdau <- FALSE + df_Ngen$coupleId <- NA_character_ + df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), ] + + # Start to connect children with mother and father + # + if (verbose == TRUE) { + message( + "Step 2.1: mark a group of potential sons and daughters in the i th generation" + ) + } + + # try to rewrite the code + # count the number of couples in the i th gen + countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5 + + # Now, assign couple IDs for the current generation + df_Ngen <- assignCoupleIds(df_Ngen) + + # get the number of linked female and male children after excluding the single children + # get a vector of single person id in the ith generation + IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)] + SingleF <- sum(df_Ngen$sex == code_female & is.na(df_Ngen$spID)) + CoupleF <- N_LinkedFemale - SingleF + SingleM <- sum(df_Ngen$sex == code_male & is.na(df_Ngen$spID)) + # CoupleM <- N_LinkedMale - SingleM + + df_Fam[df_Fam$gen == i, ] <- markPotentialChildren( + df_Ngen = df_Ngen, + i = i, + Ngen = Ngen, + sizeGens = sizeGens, + CoupleF = CoupleF, + code_male = code_male, + code_female = code_female + ) + if (verbose == TRUE) { + message( + "Step 2.2: mark a group of potential parents in the i-1 th generation" + ) + } + df_Ngen <- df_Fam[df_Fam$gen == i - 1, ] + df_Ngen$ifparent <- FALSE + df_Ngen$ifson <- FALSE + df_Ngen$ifdau <- FALSE + df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), ] + # Create a pool for the used parents + usedParentIds <- numeric() + + for (k in 1:sizeGens[i - 1]) { + # first check if the number of married couples surpass the marriage rate + if (sum(df_Ngen$ifparent) / nrow(df_Ngen) >= marR) { + break + } else { + # check if the id is used and if the member has married + if (!(df_Ngen$id[k] %in% usedParentIds) && !is.na(df_Ngen$spID[k])) { + df_Ngen$ifparent[k] <- TRUE + df_Ngen$ifparent[df_Ngen$spID == df_Ngen$id[k]] <- TRUE + usedParentIds <- c(usedParentIds, df_Ngen$id[k], df_Ngen$spID[k]) + } else { + next + } + } + } + + df_Ngen <- df_Ngen[order(as.numeric(rownames(df_Ngen))), , drop = FALSE] + df_Fam[df_Fam$gen == i - 1, ] <- df_Ngen + if (verbose == TRUE) { + message( + "Step 2.3: connect the i and i-1 th generation" + ) + } + if (i == 1) { + next + } else { + # get the df for i and i-1 th generations + df_Ngen <- df_Fam[df_Fam$gen %in% c(i, i - 1), ] + sizeI <- sizeGens[i - 1] + sizeII <- sizeGens[i] + # create a vector with ordered ids that should be connected to a parent + # message(df_Ngen) + IdSon <- df_Ngen$id[df_Ngen$ifson == TRUE & df_Ngen$gen == i] + # message(IdSon) + IdDau <- df_Ngen$id[df_Ngen$ifdau == TRUE & df_Ngen$gen == i] + # message(IdDau) + IdOfp <- evenInsert(IdSon, IdDau) + + # generate link kids to the couples + random_numbers <- adjustKidsPerCouple( + nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc, + rd_kpc = rd_kpc + ) + + # cat("final random numbers",random_numbers, "\n") + # cat("mean",sum(random_numbers)/length(random_numbers), "\n") + # create two vectors for maId and paId; replicate the ids to match the same length as IdOfp + IdMa <- numeric() + IdPa <- numeric() + usedIds <- numeric() + idx <- 1 + + for (l in 1:sizeI) { + # check if the id is used + if (!df_Ngen$id[l] %in% usedIds) { + # check if the member can be a parent + if (df_Ngen$ifparent[l] == TRUE && df_Ngen$sex[l] == code_female) { + usedIds <- c(usedIds, df_Ngen$id[l], df_Ngen$spID[l]) + IdMa <- c(IdMa, rep(df_Ngen$id[l], random_numbers[idx])) + IdPa <- c(IdPa, rep(df_Ngen$spID[l], random_numbers[idx])) + idx <- idx + 1 + } else if (df_Ngen$ifparent[l] == TRUE && df_Ngen$sex[l] == code_male) { + usedIds <- c(usedIds, df_Ngen$id[l], df_Ngen$spID[l]) + IdPa <- c(IdPa, rep(df_Ngen$id[l], random_numbers[idx])) + IdMa <- c(IdMa, rep(df_Ngen$spID[l], random_numbers[idx])) + idx <- idx + 1 + } else { + next + } + } else { + next + } + } + + # the length of IdMa and IdPa can be longer than the vector of offspring, so truncated it + ### making sure sampling out the single people instead of couples + if (length(IdPa) - length(IdOfp) > 0) { + if (verbose == TRUE) { + message("length of IdPa", length(IdPa), "\n") + } + IdRm <- sample.int(length(IdPa), size = length(IdPa) - length(IdOfp)) + IdPa <- IdPa[-IdRm] + IdMa <- IdMa[-IdRm] + } else if (length(IdPa) - length(IdOfp) < 0) { + # cat("length of IdOfp", length(IdOfp), "\n") + # cat("length of IdPa", length(IdPa), "\n") + # cat("length of IdSingle", length(IdMa), "\n") + IdRm <- resample(IdSingle, size = length(IdOfp) - length(IdPa)) + + IdOfp <- IdOfp[!(IdOfp %in% IdRm)] + } + # if (length(IdMa)- length(IdOfp) > 0){ + # IdRm <- sample.int(length(IdMa),size =length(IdMa)-length(IdOfp)) + # IdPa <- IdPa[-IdRm] + # IdMa <- IdMa[-IdRm] + # }else if (length(IdMa)-length(IdOfp) < 0) { + # IdRm <- sample.int(length(IdOfp),size =length(IdOfp)-length(IdMa)) + # IdOfp <- IdOfp[-IdRm] + # } + # message(matrix(c(IdPa, IdMa), ncol = 2)) + + # message(IdPa) + # message(IdOfp) + + # put the IdMa and IdPa into the dfFam with correspondent OfpId + for (m in seq_along(IdOfp)) { + df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- IdPa[m] + df_Ngen[df_Ngen$id == IdOfp[m], "mat"] <- IdMa[m] + } + # message(df_Ngen) + df_Fam[df_Fam$gen == i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[df_Fam$gen == i - 1, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + } + } + } + return(df_Fam) +} + +buildBetweenGenerations_old <- function(df_Fam, Ngen, sizeGens, verbose = FALSE, marR, sexR, kpc, + rd_kpc, personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", code_female = "F") { df_Fam$ifparent <- FALSE df_Fam$ifson <- FALSE df_Fam$ifdau <- FALSE @@ -641,7 +889,8 @@ simulatePedigree <- function(kpc = 3, momID = momID, dadID = dadID, code_male = code_male, - code_female = code_female + code_female = code_female, + beta = beta ) df_Fam <- df_Fam[, 1:7] From d6bbd7125d60182d8de85bf0e9d9583a27c82a3d Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Fri, 23 Jan 2026 17:37:14 -0500 Subject: [PATCH 06/27] DOC --- man/adjustKidsPerCouple.Rd | 2 +- man/assignCoupleIDs.Rd | 2 +- man/buildBetweenGenerations.Rd | 3 +- man/buildWithinGenerations.Rd | 1 + man/createGenDataFrame.Rd | 2 +- man/determineSex.Rd | 2 +- man/markPotentialChildren.Rd | 2 +- man/ped2add.Rd | 4 +- man/ped2cn.Rd | 4 +- man/ped2com.Rd | 4 +- man/ped2gen.Rd | 4 +- man/ped2mit.Rd | 4 +- man/simulatePedigree.Rd | 3 +- vignettes/v1_modelingvariancecomponents.html | 197 ++++++++------- vignettes/v2_pedigree.html | 49 ++-- vignettes/v4_validation.html | 239 ++++++++++++------- 16 files changed, 283 insertions(+), 239 deletions(-) diff --git a/man/adjustKidsPerCouple.Rd b/man/adjustKidsPerCouple.Rd index a3019c45..d840a625 100644 --- a/man/adjustKidsPerCouple.Rd +++ b/man/adjustKidsPerCouple.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R \name{adjustKidsPerCouple} \alias{adjustKidsPerCouple} \title{Generate or Adjust Number of Kids per Couple Based on Mating Rate} diff --git a/man/assignCoupleIDs.Rd b/man/assignCoupleIDs.Rd index d1de671e..22094029 100644 --- a/man/assignCoupleIDs.Rd +++ b/man/assignCoupleIDs.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R \name{assignCoupleIDs} \alias{assignCoupleIDs} \alias{assignCoupleIds} diff --git a/man/buildBetweenGenerations.Rd b/man/buildBetweenGenerations.Rd index 4e297579..cb0c6232 100644 --- a/man/buildBetweenGenerations.Rd +++ b/man/buildBetweenGenerations.Rd @@ -17,7 +17,8 @@ buildBetweenGenerations( momID = "momID", dadID = "dadID", code_male = "M", - code_female = "F" + code_female = "F", + beta = FALSE ) } \arguments{ diff --git a/man/buildWithinGenerations.Rd b/man/buildWithinGenerations.Rd index 157faed6..93508575 100644 --- a/man/buildWithinGenerations.Rd +++ b/man/buildWithinGenerations.Rd @@ -5,6 +5,7 @@ \title{Process Generations for Pedigree Simulation} \usage{ buildWithinGenerations( + beta = FALSE, sizeGens, marR, sexR, diff --git a/man/createGenDataFrame.Rd b/man/createGenDataFrame.Rd index 014ee86c..ce14ffe6 100644 --- a/man/createGenDataFrame.Rd +++ b/man/createGenDataFrame.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R \name{createGenDataFrame} \alias{createGenDataFrame} \title{Create Data Frame for Generation} diff --git a/man/determineSex.Rd b/man/determineSex.Rd index 39711ada..b56bb59d 100644 --- a/man/determineSex.Rd +++ b/man/determineSex.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R \name{determineSex} \alias{determineSex} \title{Determine Sex of Offspring} diff --git a/man/markPotentialChildren.Rd b/man/markPotentialChildren.Rd index a51e496c..e63721fd 100644 --- a/man/markPotentialChildren.Rd +++ b/man/markPotentialChildren.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R \name{markPotentialChildren} \alias{markPotentialChildren} \title{Mark and Assign children} diff --git a/man/ped2add.Rd b/man/ped2add.Rd index e26c100e..dc2fee88 100644 --- a/man/ped2add.Rd +++ b/man/ped2add.Rd @@ -27,9 +27,7 @@ ped2add( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2cn.Rd b/man/ped2cn.Rd index 69062b07..aafa5f7e 100644 --- a/man/ped2cn.Rd +++ b/man/ped2cn.Rd @@ -27,9 +27,7 @@ ped2cn( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2com.Rd b/man/ped2com.Rd index c69574d6..f34d6022 100644 --- a/man/ped2com.Rd +++ b/man/ped2com.Rd @@ -33,9 +33,7 @@ ped2com( \item{component}{character. Which component of the pedigree to return. See Details.} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2gen.Rd b/man/ped2gen.Rd index cda9f589..d57facbf 100644 --- a/man/ped2gen.Rd +++ b/man/ped2gen.Rd @@ -27,9 +27,7 @@ ped2gen( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2mit.Rd b/man/ped2mit.Rd index 0f449b00..b4992fb9 100644 --- a/man/ped2mit.Rd +++ b/man/ped2mit.Rd @@ -28,9 +28,7 @@ ped2mit( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index 36555c8e..c1b656ee 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -25,7 +25,8 @@ simulatePedigree( spouseID = "spouseID", code_male = "M", code_female = "F", - fam_shift = 1L + fam_shift = 1L, + beta = FALSE ) SimPed(...) diff --git a/vignettes/v1_modelingvariancecomponents.html b/vignettes/v1_modelingvariancecomponents.html index 84a077f9..fdb9da14 100644 --- a/vignettes/v1_modelingvariancecomponents.html +++ b/vignettes/v1_modelingvariancecomponents.html @@ -343,10 +343,10 @@

Modeling variance components

Introduction

-

This vignette provides a detailed guide to specific functions within -the BGmisc package that aid in the identification and -fitting of variance component models common in behavior genetics. We -will explore key functions such as identifyComponentModel, +

This vignette provides a detailed guide to specific functions in the +BGmisc package that aid in the identification and fitting +of variance component models common in behavior genetics. We will +explore key functions such as identifyComponentModel, providing practical examples and theoretical background. Identification ensures a unique set of parameters that define the model-implied covariance matrix, preventing free parameters from trading off one @@ -421,113 +421,106 @@

Using identifyComponentModel Function

#> character(0)

As you can see the model is identified, now that we’ve added another group. Let us confirm by fitting a model. First we prepare the data.

-
require(dplyr)
-#> Loading required package: dplyr
-#> 
-#> Attaching package: 'dplyr'
-#> The following objects are masked from 'package:stats':
-#> 
-#>     filter, lag
-#> The following objects are masked from 'package:base':
-#> 
-#>     intersect, setdiff, setequal, union
-
+
library(dplyr)
+
+
+selVars <- c("ht1", "ht2")
+
+mzdzData <- subset(
+  twinData, zyg %in% c(1, 3),
+  c(selVars, "zyg")
+)
+
+mzdzData$RCoef <- c(1, NA, .5)[mzdzData$zyg]
 
-selVars <- c("ht1", "ht2")
-
-mzdzData <- subset(
-  twinData, zyg %in% c(1, 3),
-  c(selVars, "zyg")
-)
-
-mzdzData$RCoef <- c(1, NA, .5)[mzdzData$zyg]
-
-
-mzData <- mzdzData %>% filter(zyg == 1)
+ +mzData <- mzdzData %>% filter(zyg == 1)

Let us fit the data with MZ twins by themselves.

if (!requireNamespace("EasyMx", quietly = TRUE)) {
- print("Please install EasyMx to run the model fitting examples.")
+  print("Please install EasyMx to run the model fitting examples.")
 } else {
-run1 <- emxTwinModel(
-  model = "Cholesky",
-  relatedness = "RCoef",
-  data = mzData,
-  use = selVars,
-  run = TRUE, name = "TwCh"
-)
-
-summary(run1)
-}
-#> Running TwCh with 4 parameters
-#> Summary of TwCh 
-#>  
-#> free parameters:
-#>      name matrix row col   Estimate    Std.Error A lbound ubound
-#> 1 sqrtA11  sqrtA   1   1 0.05122646           NA    1e-06       
-#> 2 sqrtC11  sqrtC   1   1 0.03518629           NA       0!       
-#> 3 sqrtE11  sqrtE   1   1 0.02325722 0.0007017955 !     0!       
-#> 4    Mht1  Means ht1   1 1.62974908 0.0027023907                
-#> 
-#> Model Statistics: 
-#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
-#>        Model:              4                   1112             -3693.148
-#>    Saturated:              5                   1111                    NA
-#> Independence:              4                   1112                    NA
-#> Number of observations/statistics: 569/1116
-#> 
-#> Information Criteria: 
-#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
-#> AIC:      -5917.148              -3685.148                -3685.078
-#> BIC:     -10747.543              -3667.773                -3680.471
-#> To get additional fit indices, see help(mxRefModels)
-#> timestamp: 2026-01-07 10:34:22 
-#> Wall clock time: 0.0555501 secs 
-#> optimizer:  SLSQP 
-#> OpenMx version number: 2.22.10 
-#> Need help?  See help(mxSummary)
+ library(EasyMx) + run1 <- emxTwinModel( + model = "Cholesky", + relatedness = "RCoef", + data = mzData, + use = selVars, + run = TRUE, name = "TwCh" + ) + + summary(run1) +} +#> Running TwCh with 4 parameters +#> Summary of TwCh +#> +#> free parameters: +#> name matrix row col Estimate Std.Error A lbound ubound +#> 1 sqrtA11 sqrtA 1 1 0.05122646 NA 1e-06 +#> 2 sqrtC11 sqrtC 1 1 0.03518629 NA 0! +#> 3 sqrtE11 sqrtE 1 1 0.02325722 0.0007017955 ! 0! +#> 4 Mht1 Means ht1 1 1.62974908 0.0027023907 +#> +#> Model Statistics: +#> | Parameters | Degrees of Freedom | Fit (-2lnL units) +#> Model: 4 1112 -3693.148 +#> Saturated: 5 1111 NA +#> Independence: 4 1112 NA +#> Number of observations/statistics: 569/1116 +#> +#> Information Criteria: +#> | df Penalty | Parameters Penalty | Sample-Size Adjusted +#> AIC: -5917.148 -3685.148 -3685.078 +#> BIC: -10747.543 -3667.773 -3680.471 +#> To get additional fit indices, see help(mxRefModels) +#> timestamp: 2026-01-23 17:36:46 +#> Wall clock time: 0.05960393 secs +#> optimizer: SLSQP +#> OpenMx version number: 2.22.10 +#> Need help? See help(mxSummary)

As you can see the model was unsuccessful because it was not identified. But when we add another group, so that the model is identified, the model now fits.

if (!requireNamespace("EasyMx", quietly = TRUE)) {
- print("Please install EasyMx to run the model fitting examples.")
+  print("Please install EasyMx to run the model fitting examples.")
 } else {
-run2 <- emxTwinModel(
-  model = "Cholesky",
-  relatedness = "RCoef",
-  data = mzdzData,
-  use = selVars,
-  run = TRUE, name = "TwCh"
-)
-
-summary(run2)
-}
-#> Running TwCh with 4 parameters
-#> Summary of TwCh 
-#>  
-#> free parameters:
-#>      name matrix row col   Estimate    Std.Error A lbound ubound
-#> 1 sqrtA11  sqrtA   1   1 0.06339271 0.0014377690    1e-06       
-#> 2 sqrtC11  sqrtC   1   1 0.00000100 0.0250260004 !     0!       
-#> 3 sqrtE11  sqrtE   1   1 0.02330040 0.0007015267       0!       
-#> 4    Mht1  Means ht1   1 1.63295540 0.0020511844                
-#> 
-#> Model Statistics: 
-#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
-#>        Model:              4                   1803             -5507.092
-#>    Saturated:              5                   1802                    NA
-#> Independence:              4                   1803                    NA
-#> Number of observations/statistics: 920/1807
-#> 
-#> Information Criteria: 
-#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
-#> AIC:      -9113.092              -5499.092                -5499.048
-#> BIC:     -17811.437              -5479.794                -5492.498
-#> To get additional fit indices, see help(mxRefModels)
-#> timestamp: 2026-01-07 10:34:22 
-#> Wall clock time: 0.04742312 secs 
-#> optimizer:  SLSQP 
-#> OpenMx version number: 2.22.10 
-#> Need help?  See help(mxSummary)
+ library(EasyMx) + run2 <- emxTwinModel( + model = "Cholesky", + relatedness = "RCoef", + data = mzdzData, + use = selVars, + run = TRUE, name = "TwCh" + ) + + summary(run2) +} +#> Running TwCh with 4 parameters +#> Summary of TwCh +#> +#> free parameters: +#> name matrix row col Estimate Std.Error A lbound ubound +#> 1 sqrtA11 sqrtA 1 1 0.06339271 0.0014377690 1e-06 +#> 2 sqrtC11 sqrtC 1 1 0.00000100 0.0250260004 ! 0! +#> 3 sqrtE11 sqrtE 1 1 0.02330040 0.0007015267 0! +#> 4 Mht1 Means ht1 1 1.63295540 0.0020511844 +#> +#> Model Statistics: +#> | Parameters | Degrees of Freedom | Fit (-2lnL units) +#> Model: 4 1803 -5507.092 +#> Saturated: 5 1802 NA +#> Independence: 4 1803 NA +#> Number of observations/statistics: 920/1807 +#> +#> Information Criteria: +#> | df Penalty | Parameters Penalty | Sample-Size Adjusted +#> AIC: -9113.092 -5499.092 -5499.048 +#> BIC: -17811.437 -5479.794 -5492.498 +#> To get additional fit indices, see help(mxRefModels) +#> timestamp: 2026-01-23 17:36:46 +#> Wall clock time: 0.04078603 secs +#> optimizer: SLSQP +#> OpenMx version number: 2.22.10 +#> Need help? See help(mxSummary) diff --git a/vignettes/v2_pedigree.html b/vignettes/v2_pedigree.html index b9c6dac0..b8f34803 100644 --- a/vignettes/v2_pedigree.html +++ b/vignettes/v2_pedigree.html @@ -48,8 +48,9 @@ }