Dataset-JSON v1.1

CDISC Dataset-JSON v1.1 is the modern, text-based alternative to XPT transport files. It stores datasets as structured JSON with embedded column metadata, making it diff-friendly, human-readable, and natively supported by the CDISC Dataset-JSON viewer. herald implements the full v1.1 specification.

Writing Dataset-JSON

# TRUE when the optional jsonlite package is not installed.
# NOTE(review): presumably used to skip evaluating the chunks below; `skip` is
# not referenced anywhere in the visible text — confirm.
skip <- !requireNamespace("jsonlite", quietly = TRUE)

library(herald)

# Three-subject demographics example reused by the later sections.
dm <- data.frame(
  STUDYID = rep("CDISCPILOT01", times = 3L),
  USUBJID = c("01-701-1015", "01-701-1023", "01-701-1028"),
  AGE = c(63L, 64L, 71L),
  SEX = c("F", "M", "M"),
  stringsAsFactors = FALSE
)

# Give every column a "label" attribute so the labels appear in the JSON
# columns metadata. Labels are listed in column order; `dm[] <-` replaces the
# columns in place and keeps dm a data frame.
dm[] <- Map(
  function(column, label) {
    attr(column, "label") <- label
    column
  },
  dm,
  c("Study Identifier", "Unique Subject Identifier", "Age", "Sex")
)

# tempfile() only generates a unique path; the file itself is created by the
# write_json() call below.
json_path <- tempfile(fileext = ".json")

# Write dm to json_path with dataset name "DM" and dataset label "Demographics".
write_json(dm, json_path, dataset = "DM", label = "Demographics")

The resulting file is valid CDISC Dataset-JSON v1.1 with a flat structure: datasetJSONVersion, name, label, records, columns, and rows at the top level. There is no clinicalData nesting (that was v1.0).

# Parse the file into plain nested lists (simplifyVector = FALSE keeps arrays
# as lists) and look at the top-level fields one at a time, using exact-name
# `[[ ]]` access.
raw <- jsonlite::fromJSON(json_path, simplifyVector = FALSE)
raw[["datasetJSONVersion"]]
#> [1] "1.1.0"
raw[["name"]]
#> [1] "DM"
raw[["label"]]
#> [1] "Demographics"
raw[["records"]]
#> [1] 3
length(raw[["columns"]])  # one entry per variable
#> [1] 4
raw[["columns"]][[3]][["name"]]
#> [1] "AGE"
raw[["columns"]][[3]][["dataType"]]
#> [1] "integer"

write_json() returns the input data frame invisibly, enabling pipes.

Reading Dataset-JSON

# Read the file written above back into R.
dm2 <- read_json(json_path)

nrow(dm2)
#> [1] 3
names(dm2)
#> [1] "STUDYID" "USUBJID" "AGE"     "SEX"

# Metadata is preserved as attributes: dataset-level values on dm2 itself,
# variable labels on the individual columns.
attr(dm2,         "label")        # dataset label
#> [1] "Demographics"
attr(dm2,         "dataset_name") # dataset name taken from the JSON
#> [1] "DM"
attr(dm2$STUDYID, "label")        # variable label
#> [1] "Study Identifier"
attr(dm2$AGE,     "label")        # variable label
#> [1] "Age"

The dataset_name attribute holds the name field from the JSON (useful when reading a file without knowing the dataset name in advance).

Handling NA values

# Adverse-event example with three records; the second record is missing both
# its term and its sequence number.
ae <- data.frame(
  STUDYID = rep("CDISCPILOT01", 3L),
  USUBJID = rep("01-701-1015", 3L),
  AETERM = c("HEADACHE", NA, "NAUSEA"),
  AESEQ = c(1L, NA, 3L),
  stringsAsFactors = FALSE
)

json_ae <- tempfile(fileext = ".json")

# Write ae (no dataset label is passed here) and read it straight back.
write_json(ae, json_ae, dataset = "AE")
ae2 <- read_json(json_ae)

# Row 2 was NA in both columns before writing; check it is still NA after the
# round trip.
is.na(ae2$AETERM[2])  # character column
#> [1] TRUE
is.na(ae2$AESEQ[2])   # integer column
#> [1] TRUE

R NA values are written as JSON null and read back as NA, so missing values survive the round trip.

Converting between XPT and Dataset-JSON

XPT → Dataset-JSON

# Fixed file name inside the session's temporary directory.
xpt_path <- file.path(tempdir(), "dm.xpt")

# Write XPT (dm still carries the column labels attached in the write_json
# example above)
write_xpt(dm, xpt_path, label = "Demographics")

json_from_xpt <- tempfile(fileext = ".json")

# Convert the XPT file (first argument) to a Dataset-JSON file (second argument).
xpt_to_json(xpt_path, json_from_xpt)

dm_from_json <- read_json(json_from_xpt)
# identical() compares attributes as well as values, so FALSE does not by
# itself mean the data changed.
# NOTE(review): the cause of the FALSE below is not shown here — presumably the
# column attributes differ; confirm, e.g. by comparing as.vector() of both sides.
identical(dm_from_json$STUDYID, dm$STUDYID)
#> [1] FALSE

Dataset-JSON → XPT

xpt_from_json <- tempfile(fileext = ".xpt")

# Convert the Dataset-JSON file written in the first example (json_path) to XPT.
json_to_xpt(json_path, xpt_from_json)

# Read the converted file and compare one column with the original data frame.
dm_from_xpt <- read_xpt(xpt_from_json)
identical(dm_from_xpt$STUDYID, dm$STUDYID)
#> [1] TRUE

Full round-trip: XPT → JSON → XPT

# Three files for the chain: XPT (xpt1) -> Dataset-JSON (json) -> XPT (xpt2).
xpt1 <- file.path(tempdir(), "dm_rt1.xpt")
json  <- tempfile(fileext = ".json")
xpt2  <- file.path(tempdir(), "dm_rt2.xpt")

write_xpt(dm, xpt1)
xpt_to_json(xpt1, json)
json_to_xpt(json, xpt2)

# Read the final XPT and compare it column by column with the starting data.
dm_rt <- read_xpt(xpt2)

identical(dm_rt$STUDYID, dm$STUDYID)
#> [1] TRUE
# NOTE(review): AGE is not identical after the round trip — presumably the
# integer column comes back as double because XPT stores numerics as floating
# point; confirm with typeof(dm_rt$AGE).
identical(dm_rt$AGE,     dm$AGE)
#> [1] FALSE
identical(dm_rt$SEX,     dm$SEX)
#> [1] TRUE

Validation with Dataset-JSON

validate() accepts JSON input with format = "json". The same spec-driven conformance engine runs regardless of input format:

# Specification to validate against: ds_spec has one row per dataset (only DM
# here), var_spec has one row per variable of DM.
vspec <- herald_spec(
  ds_spec  = data.frame(dataset = "DM", label = "Demographics",
                        stringsAsFactors = FALSE),
  var_spec = data.frame(
    dataset   = rep("DM", 4L),
    variable  = c("STUDYID","USUBJID","AGE","SEX"),
    label     = c("Study Identifier","Unique Subject Identifier","Age","Sex"),
    data_type = c("text","text","integer","text"),
    # The text lengths match the example values (12, 11 and 1 characters).
    # NOTE(review): the meaning of length 8 for the integer AGE is not shown
    # here — presumably a storage width; confirm.
    length    = c(12L,11L,8L,1L),
    stringsAsFactors = FALSE
  )
)

# validate() is given a directory, so create a fresh one; tempfile() only
# returns a unique path, dir.create() makes the directory.
vjson_dir <- tempfile()
dir.create(vjson_dir)

write_json(dm, file.path(vjson_dir, "dm.json"), dataset = "DM",
           label = "Demographics")

# NOTE(review): the effect of rules = NULL is not shown here — presumably it
# limits the run to the spec-driven checks; confirm against validate()'s docs.
result <- validate(vjson_dir, spec = vspec, format = "json", rules = NULL)
# Finding counts per severity level plus the overall total; all zero here.
result$summary
#> $reject
#> [1] 0
#> 
#> $high
#> [1] 0
#> 
#> $medium
#> [1] 0
#> 
#> $low
#> [1] 0
#> 
#> $total
#> [1] 0

Empty datasets

Dataset-JSON correctly represents zero-row datasets with schema-only output:

# Zero-row data frame: each column has a type but holds no values.
empty_dm <- data.frame(
  STUDYID = character(),
  AGE = integer(),
  stringsAsFactors = FALSE
)

json_empty <- tempfile(fileext = ".json")

# Write the zero-row data frame and read it back.
write_json(empty_dm, json_empty, dataset = "DM")
dm_empty_back <- read_json(json_empty)

# No rows, but both columns and their names are still there.
nrow(dm_empty_back)   # 0
#> [1] 0
ncol(dm_empty_back)   # 2
#> [1] 2
names(dm_empty_back)  # "STUDYID" "AGE"
#> [1] "STUDYID" "AGE"

Before vs After

Task	Old way	herald
Write Dataset-JSON	No equivalent	`write_json()`
Read Dataset-JSON	No equivalent	`read_json()`
Convert XPT → JSON	Manual parse + write	`xpt_to_json()`
Convert JSON → XPT	Manual read + write	`json_to_xpt()`
Validate JSON datasets	No equivalent	`validate(dir, format = "json")`

Dataset-JSON support is an entirely new capability — there is no equivalent in the metacore + xportr + Pinnacle 21 stack.