Skip to contents

The datatagr package provides tools to help label and validate data. The 'datatagr' class adds column-level attributes to a 'data.frame'. Once labelled, variables can be seamlessly used in downstream analyses, making data pipelines more robust and reliable.

Note

The package does not aim to have complete integration with dplyr functions. For example, dplyr::mutate() and dplyr::bind_rows() will not preserve labels. We only provide compatibility for dplyr::rename().

Main functions

  • make_datatagr(): to create datatagr objects from a data.frame or a tibble

  • set_labels(): to change or add labelled variables in a datatagr

  • labels(): to get the list of labels of a datatagr

  • labels_df(): to get a data.frame of all labelled variables

  • lost_labels_action(): to set what happens when an action causes labelled variables to be lost (e.g. removing columns storing labelled variables): issue a warning, raise an error, or do nothing

  • get_lost_labels_action(): to check the current behaviour of actions where labelled variables are lost

Dedicated methods

Methods for functions commonly used to handle data.frame objects are provided for datatagr objects, typically to help flag or prevent actions which could alter or lose labelled variables (and may thus break downstream data pipelines).

  • names() <- (and related functions, such as dplyr::rename()) will carry labels over to the renamed variables as needed

  • x[...] <- and x[[...]] <- (see sub_datatagr): will adopt the desired behaviour when labelled variables are lost

  • print(): prints info about the datatagr in addition to the data.frame or tibble

Author

Maintainer: Chris Hartgerink chris@data.org (ORCID)

Other contributors:

Examples


# using base R style
x <- make_datatagr(cars[1:50, ],
  speed = "Miles per hour",
  dist = "Distance in miles"
)
x
#> 
#> // datatagr object
#>    speed dist
#> 1      4    2
#> 2      4   10
#> 3      7    4
#> 4      7   22
#> 5      8   16
#> 6      9   10
#> 7     10   18
#> 8     10   26
#> 9     10   34
#> 10    11   17
#> 11    11   28
#> 12    12   14
#> 13    12   20
#> 14    12   24
#> 15    12   28
#> 16    13   26
#> 17    13   34
#> 18    13   34
#> 19    13   46
#> 20    14   26
#> 21    14   36
#> 22    14   60
#> 23    14   80
#> 24    15   20
#> 25    15   26
#> 26    15   54
#> 27    16   32
#> 28    16   40
#> 29    17   32
#> 30    17   40
#> 31    17   50
#> 32    18   42
#> 33    18   56
#> 34    18   76
#> 35    18   84
#> 36    19   36
#> 37    19   46
#> 38    19   68
#> 39    20   32
#> 40    20   48
#> 41    20   52
#> 42    20   56
#> 43    20   64
#> 44    22   66
#> 45    23   54
#> 46    24   70
#> 47    24   92
#> 48    24   93
#> 49    24  120
#> 50    25   85
#> 
#> labelled variables:
#>  speed - Miles per hour
#>  dist - Distance in miles 

## check labelled variables
labels(x)
#> $speed
#> [1] "Miles per hour"
#> 
#> $dist
#> [1] "Distance in miles"
#> 

## robust renaming
names(x)[1] <- "identifier"
x
#> 
#> // datatagr object
#>    identifier dist
#> 1           4    2
#> 2           4   10
#> 3           7    4
#> 4           7   22
#> 5           8   16
#> 6           9   10
#> 7          10   18
#> 8          10   26
#> 9          10   34
#> 10         11   17
#> 11         11   28
#> 12         12   14
#> 13         12   20
#> 14         12   24
#> 15         12   28
#> 16         13   26
#> 17         13   34
#> 18         13   34
#> 19         13   46
#> 20         14   26
#> 21         14   36
#> 22         14   60
#> 23         14   80
#> 24         15   20
#> 25         15   26
#> 26         15   54
#> 27         16   32
#> 28         16   40
#> 29         17   32
#> 30         17   40
#> 31         17   50
#> 32         18   42
#> 33         18   56
#> 34         18   76
#> 35         18   84
#> 36         19   36
#> 37         19   46
#> 38         19   68
#> 39         20   32
#> 40         20   48
#> 41         20   52
#> 42         20   56
#> 43         20   64
#> 44         22   66
#> 45         23   54
#> 46         24   70
#> 47         24   92
#> 48         24   93
#> 49         24  120
#> 50         25   85
#> 
#> labelled variables:
#>  identifier - Miles per hour
#>  dist - Distance in miles 

## example of dropping labels by mistake - default: warning
x[, 2]
#> Warning: The following labelled variables are lost:
#>  identifier - Miles per hour
#> 
#> // datatagr object
#>    dist
#> 1     2
#> 2    10
#> 3     4
#> 4    22
#> 5    16
#> 6    10
#> 7    18
#> 8    26
#> 9    34
#> 10   17
#> 11   28
#> 12   14
#> 13   20
#> 14   24
#> 15   28
#> 16   26
#> 17   34
#> 18   34
#> 19   46
#> 20   26
#> 21   36
#> 22   60
#> 23   80
#> 24   20
#> 25   26
#> 26   54
#> 27   32
#> 28   40
#> 29   32
#> 30   40
#> 31   50
#> 32   42
#> 33   56
#> 34   76
#> 35   84
#> 36   36
#> 37   46
#> 38   68
#> 39   32
#> 40   48
#> 41   52
#> 42   56
#> 43   64
#> 44   66
#> 45   54
#> 46   70
#> 47   92
#> 48   93
#> 49  120
#> 50   85
#> 
#> labelled variables:
#>  dist - Distance in miles 

## to silence warnings when labels are dropped
lost_labels_action("none")
#> Lost labels will now be ignored.
x[, 2]
#> 
#> // datatagr object
#>    dist
#> 1     2
#> 2    10
#> 3     4
#> 4    22
#> 5    16
#> 6    10
#> 7    18
#> 8    26
#> 9    34
#> 10   17
#> 11   28
#> 12   14
#> 13   20
#> 14   24
#> 15   28
#> 16   26
#> 17   34
#> 18   34
#> 19   46
#> 20   26
#> 21   36
#> 22   60
#> 23   80
#> 24   20
#> 25   26
#> 26   54
#> 27   32
#> 28   40
#> 29   32
#> 30   40
#> 31   50
#> 32   42
#> 33   56
#> 34   76
#> 35   84
#> 36   36
#> 37   46
#> 38   68
#> 39   32
#> 40   48
#> 41   52
#> 42   56
#> 43   64
#> 44   66
#> 45   54
#> 46   70
#> 47   92
#> 48   93
#> 49  120
#> 50   85
#> 
#> labelled variables:
#>  dist - Distance in miles 

## to trigger errors when labels are dropped
# lost_labels_action("error")
# x[, 2]

## reset default behaviour
lost_labels_action()
#> Lost labels will now issue a warning.

# using tidyverse style

## example of creating a datatagr, adding a new variable, and adding a label
## for it

if (require(dplyr) && require(magrittr)) {
  x <- cars %>%
    tibble() %>%
    make_datatagr(
      speed = "Miles per hour",
      dist = "Distance in miles"
    ) %>%
    mutate(result = if_else(speed > 50, "fast", "slow")) %>%
    set_labels(result = "Ticket yes/no")

  head(x)

  ## extract labelled variables
  x %>%
    select(has_label(c("Ticket yes/no")))

  ## Retrieve all labels
  x %>%
    labels()

  ## Select based on variable name
  x %>%
    select(starts_with("speed"))
}
#> Loading required package: dplyr
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
#> Loading required package: magrittr
#> Warning: The following labelled variables are lost:
#>  speed - Miles per hour
#>  dist - Distance in miles
#> Warning: The following labelled variables are lost:
#>  dist - Distance in miles
#>  result - Ticket yes/no
#> 
#> // datatagr object
#> # A tibble: 50 × 1
#>    speed
#>    <dbl>
#>  1     4
#>  2     4
#>  3     7
#>  4     7
#>  5     8
#>  6     9
#>  7    10
#>  8    10
#>  9    10
#> 10    11
#> # ℹ 40 more rows
#> 
#> labelled variables:
#>  speed - Miles per hour