Metadata Helpers: Labels, Types, and Spec Application

herald’s metadata system is a set of composable operations that bridge a herald_spec to a data frame. apply_spec() runs all six steps in one call. The individual step functions (scaffold_vars(), drop_unspec_vars(), coerce_types(), order_cols(), sort_keys(), set_spec_attrs()) provide granular control when you need it, and decode_var() is a separate helper for decoding coded values.

The six operations

1. scaffold_vars()    — add typed NA columns for spec variables missing from data
2. drop_unspec_vars() — remove columns not in the spec
3. coerce_types()     — coerce column types to match spec (char↔num, factor→char)
4. order_cols()       — reorder columns to match spec variable order
5. sort_keys()        — sort rows by key variables from ds_spec$keys
6. set_spec_attrs()   — set all labels, formats, lengths, and dataset label

apply_spec() applies these in order, transactionally — if any step fails, the original data frame is unchanged.

Shared fixture

# Shared specification for the examples: one dataset (DM), four variables,
# and a two-term codelist for SEX.
spec <- herald_spec(
  # Dataset-level metadata; `keys` holds the key variables, comma-separated.
  ds_spec = data.frame(
    dataset = "DM",
    label = "Demographics",
    keys = "STUDYID, USUBJID",
    stringsAsFactors = FALSE
  ),
  # Variable-level metadata, one row per DM variable.
  var_spec = data.frame(
    dataset = rep("DM", 4L),
    variable = c("STUDYID", "USUBJID", "AGE", "SEX"),
    label = c(
      "Study Identifier",
      "Unique Subject Identifier",
      "Age",
      "Sex"
    ),
    data_type = c("text", "text", "integer", "text"),
    length = c(12L, 11L, 8L, 1L),
    order = seq_len(4L),
    stringsAsFactors = FALSE
  ),
  # Code/decode pairs for the SEX codelist.
  codelist = data.frame(
    codelist_id = rep("SEX", 2L),
    term = c("M", "F"),
    decoded_value = c("Male", "Female"),
    stringsAsFactors = FALSE
  )
)

scaffold_vars()

Adds typed NA columns for any variable in the spec that is missing from the data frame. The type (character vs numeric) is inferred from data_type.

# Two-subject DM extract that lacks STUDYID, which the spec defines
partial_dm <- data.frame(
  USUBJID = c("01-701-1015", "01-701-1023"),
  AGE     = c(63L, 64L),
  SEX     = c("F", "M"),
  stringsAsFactors = FALSE
)

# suppressMessages() silences any messages emitted by scaffold_vars()
dm_scaffolded <- suppressMessages(scaffold_vars(partial_dm, spec, "DM"))
# The scaffolded STUDYID column is appended after the existing columns
names(dm_scaffolded)
#> [1] "USUBJID" "AGE"     "SEX"     "STUDYID"
dm_scaffolded$STUDYID          # NA_character_ (spec says data_type = "text")
#> [1] NA NA
class(dm_scaffolded$STUDYID)   # "character"
#> [1] "character"

Text-like variables (text, string, date, datetime) become NA_character_; numeric variables (integer, float, decimal) become NA_real_.

drop_unspec_vars()

Removes columns not listed in the spec. Use this to trim scratch columns or SAS macro artifacts from your data before submission.

# One DM row carrying two extra columns that the spec does not define
dm_extra <- data.frame(
  STUDYID = "CDISCPILOT01",
  USUBJID = "01-701-1015",
  AGE     = 63L,
  SEX     = "F",
  INTERNAL_FLAG = "Y",    # not in spec
  SCRATCH_COL   = 99L,    # not in spec
  stringsAsFactors = FALSE
)

# Drop the columns that are not listed for DM in the spec
dm_clean <- suppressMessages(drop_unspec_vars(dm_extra, spec, "DM"))
names(dm_clean)   # only spec variables remain
#> [1] "STUDYID" "USUBJID" "AGE"     "SEX"

coerce_types()

Coerces column types to match the spec. Handles character → numeric, numeric → character, and factor → character.

# AGE arrived as character from a CSV import
dm_csv <- data.frame(
  STUDYID = "CDISCPILOT01",
  USUBJID = "01-701-1015",
  AGE     = "63",           # character, spec says integer
  SEX     = "F",
  stringsAsFactors = FALSE
)

# Coerce columns to the spec's types; messages are silenced for clean output
dm_coerced <- suppressMessages(coerce_types(dm_csv, spec, "DM"))
# Note: the spec data_type is "integer", but the coerced column is "numeric"
class(dm_coerced$AGE)   # "numeric" — coerced from character
#> [1] "numeric"
dm_coerced$AGE
#> [1] 63

Warning on NA introduction

# "UNKNOWN" cannot be coerced to numeric
dm_bad_age <- data.frame(
  STUDYID = "CDISCPILOT01",
  USUBJID = "01-701-1015",
  AGE     = "UNKNOWN",
  SEX     = "F",
  stringsAsFactors = FALSE
)

# No suppressMessages() here, so the warning and the summary message are shown
dm_coerced2 <- coerce_types(dm_bad_age, spec, "DM")
#> Warning: Coercing `AGE` to numeric introduced 1 NA.
#> ✖ Non-numeric value: "UNKNOWN"
#> ℹ These values became "NA".
#> Coerced 1 variable: AGE (char→num)
# The value that could not be parsed is now NA
is.na(dm_coerced2$AGE)   # TRUE — warning was emitted
#> [1] TRUE

Attributes are preserved during coercion

# Attach a label to the (still character) AGE column of dm_csv before coercing
attr(dm_csv$AGE, "label") <- "Age"
dm_coerced3 <- suppressMessages(coerce_types(dm_csv, spec, "DM"))
attr(dm_coerced3$AGE, "label")   # "Age" — label survived coercion
#> [1] "Age"

order_cols()

Reorders columns to match the order column in var_spec. Variables not in the spec trail at the end with a warning.

# One DM row whose columns are in the reverse of the spec's order
dm_shuffled <- data.frame(
  SEX     = "F",
  AGE     = 63L,
  USUBJID = "01-701-1015",
  STUDYID = "CDISCPILOT01",
  stringsAsFactors = FALSE
)

# Reorder the columns to follow the spec; messages are silenced for clean output
dm_ordered <- suppressMessages(order_cols(dm_shuffled, spec, "DM"))
names(dm_ordered)  # STUDYID, USUBJID, AGE, SEX — spec order
#> [1] "STUDYID" "USUBJID" "AGE"     "SEX"

sort_keys()

Sorts rows by the key variables listed in ds_spec$keys. Also sets the herald.sort_keys attribute so write_xpt() and write_json() know the data is already sorted.

# Three DM rows sharing one STUDYID, with USUBJID deliberately out of order
dm_unsorted <- data.frame(
  STUDYID = rep("CDISCPILOT01", 3L),
  USUBJID = c("01-701-1028", "01-701-1015", "01-701-1023"),  # out of order
  AGE     = c(71L, 63L, 64L),
  SEX     = c("M", "F", "M"),
  stringsAsFactors = FALSE
)

# Sort rows by the DM keys from the spec (keys = "STUDYID, USUBJID")
dm_sorted <- sort_keys(dm_unsorted, spec, "DM")
dm_sorted$USUBJID                        # now in ascending order
#> [1] "01-701-1015" "01-701-1023" "01-701-1028"
# The keys used for sorting are recorded as an attribute on the data frame
attr(dm_sorted, "herald.sort_keys")      # c("STUDYID", "USUBJID")
#> [1] "STUDYID" "USUBJID"

decode_var()

Decodes a coded column using a codelist from the spec. Creates a new column with the decoded values. Warns about codes not found in the codelist.

# Three DM rows whose SEX column still holds coded terms.
dm_coded <- data.frame(
  STUDYID = rep("CDISCPILOT01", 3L),
  USUBJID = c("01-701-1015", "01-701-1023", "01-701-1028"),
  SEX = c("F", "M", "M"),
  stringsAsFactors = FALSE
)

# Add codelist_id to var_spec so decode_var() can tell which codelist applies;
# only SEX is linked to a codelist, the other three variables get NA.
spec_with_cl <- herald_spec(
  ds_spec = spec$ds_spec,
  var_spec = cbind(
    spec$var_spec,
    codelist_id = c(rep(NA_character_, 3L), "SEX"),
    stringsAsFactors = FALSE
  ),
  codelist = spec$codelist
)

# Decode SEX into a new SEXDCD column, then show the coded/decoded pair.
dm_decoded <- decode_var(
  dm_coded, spec_with_cl, "DM",
  from = "SEX", to = "SEXDCD"
)
dm_decoded[c("SEX", "SEXDCD")]
#>   SEX SEXDCD
#> 1   F Female
#> 2   M   Male
#> 3   M   Male

apply_spec() — all six steps in one call

apply_spec() is idempotent and transactional:

- Idempotent: calling it twice produces the same result as calling it once.
- Transactional: if any step errors, the original data frame is returned unchanged.

# Raw input: no STUDYID column, unsorted USUBJID, character AGE, and a
# SCRATCH column that the spec does not define
dm_raw <- data.frame(
  USUBJID = c("01-701-1028", "01-701-1015", "01-701-1023"),  # unsorted
  AGE     = c("71", "63", "64"),    # character, spec says integer
  SEX     = c("M", "F", "M"),
  SCRATCH = "delete me",            # not in spec
  stringsAsFactors = FALSE
)

# Run every step in a single call; messages are silenced for clean output
dm_ready <- suppressMessages(apply_spec(dm_raw, spec, "DM"))

# All six operations applied:
names(dm_ready)                      # STUDYID(scaffolded), USUBJID, AGE, SEX
#> [1] "STUDYID" "USUBJID" "AGE"     "SEX"
class(dm_ready$AGE)                  # "numeric" — coerced
#> [1] "numeric"
# Printing a column also shows the label and sas.length attributes it carries
dm_ready$STUDYID                     # NA_character_ — scaffolded
#> [1] NA NA NA
#> attr(,"label")
#> [1] "Study Identifier"
#> attr(,"sas.length")
#> [1] 12
dm_ready$USUBJID                     # sorted ascending
#> [1] "01-701-1015" "01-701-1023" "01-701-1028"
#> attr(,"label")
#> [1] "Unique Subject Identifier"
#> attr(,"sas.length")
#> [1] 11
attr(dm_ready$AGE,  "label")        # "Age"
#> [1] "Age"
# Dataset-level attributes: the dataset label and the keys used for sorting
attr(dm_ready,      "label")        # "Demographics"
#> [1] "Demographics"
attr(dm_ready,      "herald.sort_keys")  # c("STUDYID", "USUBJID")
#> [1] "STUDYID" "USUBJID"

Idempotency check

# Re-apply the spec to the already-processed data frame
dm_twice <- suppressMessages(apply_spec(dm_ready, spec, "DM"))
# NOTE: these spot checks compare only USUBJID and the dataset label,
# not the two data frames in full
identical(dm_twice$USUBJID, dm_ready$USUBJID)  # TRUE
#> [1] TRUE
identical(attr(dm_twice, "label"), attr(dm_ready, "label"))  # TRUE
#> [1] TRUE

Accepting a file path instead of a spec object

# Guarded on jsonlite being installed (presumably needed for the JSON spec
# file — confirm against write_spec())
if (requireNamespace("jsonlite", quietly = TRUE)) {
  # Write the shared spec to a temporary .json file
  json_spec <- tempfile(fileext = ".json")
  write_spec(spec, json_spec)

  # apply_spec() reads the spec from file automatically
  dm_from_path <- suppressMessages(apply_spec(dm_raw, json_spec, "DM"))
  # Column names match the result obtained with the in-memory spec
  identical(names(dm_from_path), names(dm_ready))
}
#> [1] TRUE

Before vs After

Operation	xportr	herald
Set labels	`xportr::xportr_label(dm, meta, domain = "DM")`	`set_label(dm, AGE = "Age", ...)` or `apply_spec(dm, spec, "DM")`
Coerce types	`xportr::xportr_type(dm, meta, domain = "DM")`	`coerce_types(dm, spec, "DM")`
Set lengths	`xportr::xportr_length(dm, meta, domain = "DM")`	`set_length(dm, AGE = 8L, ...)`
Set formats	`xportr::xportr_format(dm, meta, domain = "DM")`	`set_format(dm, AGE = "8.")`
Reorder columns	`xportr::xportr_order(dm, meta, domain = "DM")`	`order_cols(dm, spec, "DM")`
All at once	5 separate `xportr_*` calls + a pipe	`apply_spec(dm, spec, "DM")`
Scaffold missing vars	Not available	`scaffold_vars(dm, spec, "DM")`
Decode coded values	Not available	`decode_var(dm, spec, "DM", "SEX", "SEXDCD")`
Sort by keys	Not available	`sort_keys(dm, spec, "DM")`