
Metadata Helpers: Labels, Types, and Spec Application
Source: vignettes/metadata-helpers.Rmd
metadata-helpers.Rmd
herald’s metadata system is a set of composable operations that
bridge a herald_spec to a data frame.
apply_spec() runs all six steps in one call. The individual
functions (scaffold_vars(),
drop_unspec_vars(), coerce_types(),
order_cols(), sort_keys(),
set_spec_attrs()) provide granular control when you need
it; decode_var() is a separate helper that is not part of the six-step pipeline.
The six operations
1. scaffold_vars() — add typed NA columns for spec variables missing from data
2. drop_unspec_vars() — remove columns not in the spec
3. coerce_types() — coerce column types to match spec (char↔num, factor→char)
4. order_cols() — reorder columns to match spec variable order
5. sort_keys() — sort rows by key variables from ds_spec$keys
6. set_spec_attrs() — set all labels, formats, lengths, and dataset label
apply_spec() applies these in order, transactionally —
if any step fails, the original data frame is unchanged.
Shared fixture
spec <- herald_spec(
ds_spec = data.frame(
dataset = "DM",
label = "Demographics",
keys = "STUDYID, USUBJID",
stringsAsFactors = FALSE
),
var_spec = data.frame(
dataset = c("DM","DM","DM","DM"),
variable = c("STUDYID","USUBJID","AGE","SEX"),
label = c("Study Identifier","Unique Subject Identifier","Age","Sex"),
data_type = c("text","text","integer","text"),
length = c(12L,11L,8L,1L),
order = c(1L,2L,3L,4L),
stringsAsFactors = FALSE
),
codelist = data.frame(
codelist_id = c("SEX","SEX"),
term = c("M","F"),
decoded_value = c("Male","Female"),
stringsAsFactors = FALSE
)
)
scaffold_vars()
Adds typed NA columns for any variable in the spec that is missing
from the data frame. The type (character vs numeric) is inferred from
data_type.
# Data is missing STUDYID
partial_dm <- data.frame(
USUBJID = c("01-701-1015", "01-701-1023"),
AGE = c(63L, 64L),
SEX = c("F", "M"),
stringsAsFactors = FALSE
)
dm_scaffolded <- suppressMessages(scaffold_vars(partial_dm, spec, "DM"))
names(dm_scaffolded)
#> [1] "USUBJID" "AGE" "SEX" "STUDYID"
dm_scaffolded$STUDYID # NA_character_ (spec says data_type = "text")
#> [1] NA NA
class(dm_scaffolded$STUDYID) # "character"
#> [1] "character"
Text variables (date, datetime, string) become
NA_character_; numeric variables (integer, float, decimal)
become NA_real_.
drop_unspec_vars()
Removes columns not listed in the spec. Use this to trim scratch columns or SAS macro artifacts from your data before submission.
dm_extra <- data.frame(
STUDYID = "CDISCPILOT01",
USUBJID = "01-701-1015",
AGE = 63L,
SEX = "F",
INTERNAL_FLAG = "Y", # not in spec
SCRATCH_COL = 99L, # not in spec
stringsAsFactors = FALSE
)
dm_clean <- suppressMessages(drop_unspec_vars(dm_extra, spec, "DM"))
names(dm_clean) # only spec variables remain
#> [1] "STUDYID" "USUBJID" "AGE" "SEX"
coerce_types()
Coerces column types to match the spec. Handles character → numeric, numeric → character, and factor → character.
# AGE arrived as character from a CSV import
dm_csv <- data.frame(
STUDYID = "CDISCPILOT01",
USUBJID = "01-701-1015",
AGE = "63", # character, spec says integer
SEX = "F",
stringsAsFactors = FALSE
)
dm_coerced <- suppressMessages(coerce_types(dm_csv, spec, "DM"))
class(dm_coerced$AGE) # "numeric" — coerced from character
#> [1] "numeric"
dm_coerced$AGE
#> [1] 63
Warning on NA introduction
# "UNKNOWN" cannot be coerced to numeric
dm_bad_age <- data.frame(
STUDYID = "CDISCPILOT01",
USUBJID = "01-701-1015",
AGE = "UNKNOWN",
SEX = "F",
stringsAsFactors = FALSE
)
dm_coerced2 <- coerce_types(dm_bad_age, spec, "DM")
#> Warning: Coercing `AGE` to numeric introduced 1 NA.
#> ✖ Non-numeric value: "UNKNOWN"
#> ℹ These values became "NA".
#> Coerced 1 variable: AGE (char→num)
is.na(dm_coerced2$AGE) # TRUE — warning was emitted
#> [1] TRUE
Attributes are preserved during coercion
attr(dm_csv$AGE, "label") <- "Age"
dm_coerced3 <- suppressMessages(coerce_types(dm_csv, spec, "DM"))
attr(dm_coerced3$AGE, "label") # "Age" — label survived coercion
#> [1] "Age"
order_cols()
Reorders columns to match the order column in
var_spec. Variables not in the spec trail at the end with a
warning.
dm_shuffled <- data.frame(
SEX = "F",
AGE = 63L,
USUBJID = "01-701-1015",
STUDYID = "CDISCPILOT01",
stringsAsFactors = FALSE
)
dm_ordered <- suppressMessages(order_cols(dm_shuffled, spec, "DM"))
names(dm_ordered) # STUDYID, USUBJID, AGE, SEX — spec order
#> [1] "STUDYID" "USUBJID" "AGE" "SEX"
sort_keys()
Sorts rows by the key variables listed in ds_spec$keys.
Also sets the herald.sort_keys attribute so
write_xpt() and write_json() know the data is
already sorted.
dm_unsorted <- data.frame(
STUDYID = rep("CDISCPILOT01", 3L),
USUBJID = c("01-701-1028", "01-701-1015", "01-701-1023"), # out of order
AGE = c(71L, 63L, 64L),
SEX = c("M", "F", "M"),
stringsAsFactors = FALSE
)
dm_sorted <- sort_keys(dm_unsorted, spec, "DM")
dm_sorted$USUBJID # now in ascending order
#> [1] "01-701-1015" "01-701-1023" "01-701-1028"
attr(dm_sorted, "herald.sort_keys") # c("STUDYID", "USUBJID")
#> [1] "STUDYID" "USUBJID"
decode_var()
Decodes a coded column using a codelist from the spec. Creates a new column with the decoded values. Warns about codes not found in the codelist.
dm_coded <- data.frame(
STUDYID = rep("CDISCPILOT01", 3L),
USUBJID = c("01-701-1015", "01-701-1023", "01-701-1028"),
SEX = c("F", "M", "M"),
stringsAsFactors = FALSE
)
# Add codelist_id to var_spec so decode_var knows which codelist to use
spec_with_cl <- herald_spec(
ds_spec = spec$ds_spec,
var_spec = cbind(spec$var_spec,
codelist_id = c(NA, NA, NA, "SEX"),
stringsAsFactors = FALSE),
codelist = spec$codelist
)
dm_decoded <- decode_var(dm_coded, spec_with_cl, "DM", from = "SEX", to = "SEXDCD")
dm_decoded[, c("SEX", "SEXDCD")]
#> SEX SEXDCD
#> 1 F Female
#> 2 M Male
#> 3 M Male
apply_spec() — all six steps in one call
apply_spec() is idempotent and transactional:
- Idempotent: calling it twice produces the same result as calling it
  once.
- Transactional: if any step errors, the original data frame is
  returned unchanged.
dm_raw <- data.frame(
USUBJID = c("01-701-1028", "01-701-1015", "01-701-1023"), # unsorted
AGE = c("71", "63", "64"), # character, spec says integer
SEX = c("M", "F", "M"),
SCRATCH = "delete me", # not in spec
stringsAsFactors = FALSE
)
dm_ready <- suppressMessages(apply_spec(dm_raw, spec, "DM"))
# All six operations applied:
names(dm_ready) # STUDYID(scaffolded), USUBJID, AGE, SEX
#> [1] "STUDYID" "USUBJID" "AGE" "SEX"
class(dm_ready$AGE) # "numeric" — coerced
#> [1] "numeric"
dm_ready$STUDYID # NA_character_ — scaffolded
#> [1] NA NA NA
#> attr(,"label")
#> [1] "Study Identifier"
#> attr(,"sas.length")
#> [1] 12
dm_ready$USUBJID # sorted ascending
#> [1] "01-701-1015" "01-701-1023" "01-701-1028"
#> attr(,"label")
#> [1] "Unique Subject Identifier"
#> attr(,"sas.length")
#> [1] 11
attr(dm_ready$AGE, "label") # "Age"
#> [1] "Age"
attr(dm_ready, "label") # "Demographics"
#> [1] "Demographics"
attr(dm_ready, "herald.sort_keys") # c("STUDYID", "USUBJID")
#> [1] "STUDYID" "USUBJID"
Idempotency check
dm_twice <- suppressMessages(apply_spec(dm_ready, spec, "DM"))
identical(dm_twice$USUBJID, dm_ready$USUBJID) # TRUE
#> [1] TRUE
identical(attr(dm_twice, "label"), attr(dm_ready, "label")) # TRUE
#> [1] TRUE
Accepting a file path instead of a spec object
if (requireNamespace("jsonlite", quietly = TRUE)) {
json_spec <- tempfile(fileext = ".json")
write_spec(spec, json_spec)
# apply_spec() reads the spec from file automatically
dm_from_path <- suppressMessages(apply_spec(dm_raw, json_spec, "DM"))
identical(names(dm_from_path), names(dm_ready))
}
#> [1] TRUE
Before vs After
| Operation | xportr | herald |
|---|---|---|
| Set labels | xportr::xportr_label(dm, meta, domain = "DM") | set_label(dm, AGE = "Age", ...) or apply_spec(dm, spec, "DM") |
| Coerce types | xportr::xportr_type(dm, meta, domain = "DM") | coerce_types(dm, spec, "DM") |
| Set lengths | xportr::xportr_length(dm, meta, domain = "DM") | set_length(dm, AGE = 8L, ...) |
| Set formats | xportr::xportr_format(dm, meta, domain = "DM") | set_format(dm, AGE = "8.") |
| Reorder columns | xportr::xportr_order(dm, meta, domain = "DM") | order_cols(dm, spec, "DM") |
| All at once | 5 separate xportr_* calls + a pipe | apply_spec(dm, spec, "DM") |
| Scaffold missing vars | Not available | scaffold_vars(dm, spec, "DM") |
| Decode coded values | Not available | decode_var(dm, spec, "DM", "SEX", "SEXDCD") |
| Sort by keys | Not available | sort_keys(dm, spec, "DM") |
What to read next
- vignette("spec-management") — building and reading specs
- vignette("xpt-io") — writing XPT files after applying spec
- vignette("submission-workflow") — submit() calls apply_spec() automatically