CDISC Dataset-JSON v1.1 is the modern, text-based alternative to XPT transport files. It stores datasets as structured JSON with embedded column metadata, making it diff-friendly, human-readable, and natively supported by the CDISC Dataset-JSON viewer. herald implements the full v1.1 specification.
Writing Dataset-JSON
skip <- !requireNamespace("jsonlite", quietly = TRUE)
library(herald)
dm <- data.frame(
STUDYID = rep("CDISCPILOT01", 3L),
USUBJID = c("01-701-1015", "01-701-1023", "01-701-1028"),
AGE = c(63L, 64L, 71L),
SEX = c("F", "M", "M"),
stringsAsFactors = FALSE
)
# Attach labels so they appear in the JSON columns metadata
attr(dm$STUDYID, "label") <- "Study Identifier"
attr(dm$USUBJID, "label") <- "Unique Subject Identifier"
attr(dm$AGE, "label") <- "Age"
attr(dm$SEX, "label") <- "Sex"
json_path <- tempfile(fileext = ".json")
write_json(dm, json_path, dataset = "DM", label = "Demographics")

The resulting file is valid CDISC Dataset-JSON v1.1 with a flat
structure: datasetJSONVersion, name,
label, records, columns, and
rows at the top level. There is no
clinicalData nesting (that was v1.0).
raw <- jsonlite::fromJSON(json_path, simplifyVector = FALSE)
raw$datasetJSONVersion
#> [1] "1.1.0"
raw$name
#> [1] "DM"
raw$label
#> [1] "Demographics"
raw$records
#> [1] 3
length(raw$columns) # one entry per variable
#> [1] 4
raw$columns[[3]]$name
#> [1] "AGE"
raw$columns[[3]]$dataType
#> [1] "integer"

write_json() returns the input data frame invisibly,
enabling pipes.
Reading Dataset-JSON
dm2 <- read_json(json_path)
nrow(dm2)
#> [1] 3
names(dm2)
#> [1] "STUDYID" "USUBJID" "AGE" "SEX"
# Metadata attributes are preserved
attr(dm2, "label") # "Demographics"
#> [1] "Demographics"
attr(dm2, "dataset_name") # "DM"
#> [1] "DM"
attr(dm2$STUDYID, "label") # "Study Identifier"
#> [1] "Study Identifier"
attr(dm2$AGE, "label") # "Age"
#> [1] "Age"

The dataset_name attribute holds the name
field from the JSON (useful when reading a file without knowing the
dataset name in advance).
Handling NA values
ae <- data.frame(
STUDYID = "CDISCPILOT01",
USUBJID = "01-701-1015",
AETERM = c("HEADACHE", NA_character_, "NAUSEA"),
AESEQ = c(1L, NA_integer_, 3L),
stringsAsFactors = FALSE
)
json_ae <- tempfile(fileext = ".json")
write_json(ae, json_ae, dataset = "AE")
ae2 <- read_json(json_ae)
is.na(ae2$AETERM[2]) # TRUE — NA preserved
#> [1] TRUE
is.na(ae2$AESEQ[2]) # TRUE
#> [1] TRUE

JSON null values round-trip to R NA.
Converting between XPT and Dataset-JSON
XPT → Dataset-JSON
xpt_path <- file.path(tempdir(), "dm.xpt")
# Write XPT (with labels attached from the write_json example above)
write_xpt(dm, xpt_path, label = "Demographics")
json_from_xpt <- tempfile(fileext = ".json")
xpt_to_json(xpt_path, json_from_xpt)
dm_from_json <- read_json(json_from_xpt)
identical(dm_from_json$STUDYID, dm$STUDYID)
#> [1] FALSE

Dataset-JSON → XPT
xpt_from_json <- tempfile(fileext = ".xpt")
json_to_xpt(json_path, xpt_from_json)
dm_from_xpt <- read_xpt(xpt_from_json)
identical(dm_from_xpt$STUDYID, dm$STUDYID)
#> [1] TRUE

Full round-trip: XPT → JSON → XPT
xpt1 <- file.path(tempdir(), "dm_rt1.xpt")
json <- tempfile(fileext = ".json")
xpt2 <- file.path(tempdir(), "dm_rt2.xpt")
write_xpt(dm, xpt1)
xpt_to_json(xpt1, json)
json_to_xpt(json, xpt2)
dm_rt <- read_xpt(xpt2)
identical(dm_rt$STUDYID, dm$STUDYID)
#> [1] TRUE
identical(dm_rt$AGE, dm$AGE)
#> [1] FALSE
identical(dm_rt$SEX, dm$SEX)
#> [1] TRUE

Validation with Dataset-JSON
validate() accepts JSON input with
format = "json". The same spec-driven conformance engine
runs regardless of input format:
vspec <- herald_spec(
ds_spec = data.frame(dataset = "DM", label = "Demographics",
stringsAsFactors = FALSE),
var_spec = data.frame(
dataset = rep("DM", 4L),
variable = c("STUDYID","USUBJID","AGE","SEX"),
label = c("Study Identifier","Unique Subject Identifier","Age","Sex"),
data_type = c("text","text","integer","text"),
length = c(12L,11L,8L,1L),
stringsAsFactors = FALSE
)
)
vjson_dir <- tempfile()
dir.create(vjson_dir)
write_json(dm, file.path(vjson_dir, "dm.json"), dataset = "DM",
label = "Demographics")
result <- validate(vjson_dir, spec = vspec, format = "json", rules = NULL)
result$summary
#> $reject
#> [1] 0
#>
#> $high
#> [1] 0
#>
#> $medium
#> [1] 0
#>
#> $low
#> [1] 0
#>
#> $total
#> [1] 0

Empty datasets
Dataset-JSON correctly represents zero-row datasets with schema-only output:
empty_dm <- data.frame(
STUDYID = character(0L),
AGE = integer(0L),
stringsAsFactors = FALSE
)
json_empty <- tempfile(fileext = ".json")
write_json(empty_dm, json_empty, dataset = "DM")
dm_empty_back <- read_json(json_empty)
nrow(dm_empty_back) # 0
#> [1] 0
ncol(dm_empty_back) # 2
#> [1] 2
names(dm_empty_back) # "STUDYID" "AGE"
#> [1] "STUDYID" "AGE"

Before vs After
| Task | Old way | herald |
|---|---|---|
| Write Dataset-JSON | No equivalent | write_json() |
| Read Dataset-JSON | No equivalent | read_json() |
| Convert XPT → JSON | Manual parse + write | xpt_to_json() |
| Convert JSON → XPT | Manual read + write | json_to_xpt() |
| Validate JSON datasets | No equivalent | validate(dir, format = "json") |
Dataset-JSON support is an entirely new capability — there is no equivalent in the metacore + xportr + Pinnacle 21 stack.
What to read next
- vignette("xpt-io") — XPT transport file details
- vignette("metadata-helpers") — apply_spec() for setting labels before write
- vignette("validation") — conformance checking in depth
