Skip to contents

The safeframe package provides tools to help tag and validate data. The 'safeframe' class adds column level attributes to a 'data.frame'. Once tagged, variables can be seamlessly used in downstream analyses, making data pipelines more robust and reliable.

Note

The package does not aim to have complete integration with dplyr functions. For example, dplyr::mutate() and dplyr::bind_rows() will not preserve tags in all cases. We only provide compatibility for dplyr::rename().

Main functions

  • make_safeframe(): to create safeframe objects from a data.frame or a tibble

  • set_tags(): to change or add tagged variables in a safeframe

  • tags(): to get the list of tags of a safeframe

  • tags_df(): to get a data.frame of all tagged variables

  • lost_tags_action(): to set what happens when an action loses tagged variables (e.g. removing columns that store tagged variables): issue a warning, raise an error, or do nothing

  • get_lost_tags_action(): to check the current behaviour of actions where tagged variables are lost

Dedicated methods

Methods commonly used to handle a data.frame are provided for safeframe objects, typically to help flag or prevent actions which could alter or lose tagged variables (and may thus break downstream data pipelines).

  • names() <- (and related functions, such as dplyr::rename()) will rename variables and carry forward the existing tags

  • x[...] <- and x[[...]] <- (see sub_safeframe): will apply the behaviour set with lost_tags_action() when tagged variables are lost

  • print(): prints info about the safeframe in addition to the data.frame or tibble

Author

Maintainer: Chris Hartgerink chris@data.org (ORCID)

Other contributors:

Examples


# using base R style
## tag names go on the left, column names on the right:
## column `speed` is tagged as `mph`, column `dist` as `distance`
x <- make_safeframe(cars[1:50, ],
  mph = "speed",
  distance = "dist"
)
x
#> 
#> // safeframe object
#>    speed dist
#> 1      4    2
#> 2      4   10
#> 3      7    4
#> 4      7   22
#> 5      8   16
#> 6      9   10
#> 7     10   18
#> 8     10   26
#> 9     10   34
#> 10    11   17
#> 11    11   28
#> 12    12   14
#> 13    12   20
#> 14    12   24
#> 15    12   28
#> 16    13   26
#> 17    13   34
#> 18    13   34
#> 19    13   46
#> 20    14   26
#> 21    14   36
#> 22    14   60
#> 23    14   80
#> 24    15   20
#> 25    15   26
#> 26    15   54
#> 27    16   32
#> 28    16   40
#> 29    17   32
#> 30    17   40
#> 31    17   50
#> 32    18   42
#> 33    18   56
#> 34    18   76
#> 35    18   84
#> 36    19   36
#> 37    19   46
#> 38    19   68
#> 39    20   32
#> 40    20   48
#> 41    20   52
#> 42    20   56
#> 43    20   64
#> 44    22   66
#> 45    23   54
#> 46    24   70
#> 47    24   92
#> 48    24   93
#> 49    24  120
#> 50    25   85
#> 
#> tagged variables:
#>  mph - speed
#>  distance - dist 

## check tagged variables
## (returns a named list: element names are tags, values are column names)
tags(x)
#> $mph
#> [1] "speed"
#> 
#> $distance
#> [1] "dist"
#> 

## robust renaming
## renaming a column via `names<-` carries its tag over to the new name
## (the `mph` tag now points at `identifier`)
names(x)[1] <- "identifier"
x
#> 
#> // safeframe object
#>    identifier dist
#> 1           4    2
#> 2           4   10
#> 3           7    4
#> 4           7   22
#> 5           8   16
#> 6           9   10
#> 7          10   18
#> 8          10   26
#> 9          10   34
#> 10         11   17
#> 11         11   28
#> 12         12   14
#> 13         12   20
#> 14         12   24
#> 15         12   28
#> 16         13   26
#> 17         13   34
#> 18         13   34
#> 19         13   46
#> 20         14   26
#> 21         14   36
#> 22         14   60
#> 23         14   80
#> 24         15   20
#> 25         15   26
#> 26         15   54
#> 27         16   32
#> 28         16   40
#> 29         17   32
#> 30         17   40
#> 31         17   50
#> 32         18   42
#> 33         18   56
#> 34         18   76
#> 35         18   84
#> 36         19   36
#> 37         19   46
#> 38         19   68
#> 39         20   32
#> 40         20   48
#> 41         20   52
#> 42         20   56
#> 43         20   64
#> 44         22   66
#> 45         23   54
#> 46         24   70
#> 47         24   92
#> 48         24   93
#> 49         24  120
#> 50         25   85
#> 
#> tagged variables:
#>  mph - identifier
#>  distance - dist 

## example of dropping tags by mistake - default: warning
## keeping only column 2 (`dist`) drops `identifier`, the column that holds
## the `mph` tag, so a warning is issued
x[, 2]
#> Warning: The following tagged variables are lost:
#>  identifier - mph
#> 
#> // safeframe object
#>    dist
#> 1     2
#> 2    10
#> 3     4
#> 4    22
#> 5    16
#> 6    10
#> 7    18
#> 8    26
#> 9    34
#> 10   17
#> 11   28
#> 12   14
#> 13   20
#> 14   24
#> 15   28
#> 16   26
#> 17   34
#> 18   34
#> 19   46
#> 20   26
#> 21   36
#> 22   60
#> 23   80
#> 24   20
#> 25   26
#> 26   54
#> 27   32
#> 28   40
#> 29   32
#> 30   40
#> 31   50
#> 32   42
#> 33   56
#> 34   76
#> 35   84
#> 36   36
#> 37   46
#> 38   68
#> 39   32
#> 40   48
#> 41   52
#> 42   56
#> 43   64
#> 44   66
#> 45   54
#> 46   70
#> 47   92
#> 48   93
#> 49  120
#> 50   85
#> 
#> tagged variables:
#>  distance - dist 

## to silence warnings when tags are dropped
lost_tags_action("none")
#> Lost tags will now be ignored.
## same subset as before, this time without a warning
x[, 2]
#> 
#> // safeframe object
#>    dist
#> 1     2
#> 2    10
#> 3     4
#> 4    22
#> 5    16
#> 6    10
#> 7    18
#> 8    26
#> 9    34
#> 10   17
#> 11   28
#> 12   14
#> 13   20
#> 14   24
#> 15   28
#> 16   26
#> 17   34
#> 18   34
#> 19   46
#> 20   26
#> 21   36
#> 22   60
#> 23   80
#> 24   20
#> 25   26
#> 26   54
#> 27   32
#> 28   40
#> 29   32
#> 30   40
#> 31   50
#> 32   42
#> 33   56
#> 34   76
#> 35   84
#> 36   36
#> 37   46
#> 38   68
#> 39   32
#> 40   48
#> 41   52
#> 42   56
#> 43   64
#> 44   66
#> 45   54
#> 46   70
#> 47   92
#> 48   93
#> 49  120
#> 50   85
#> 
#> tagged variables:
#>  distance - dist 

## to trigger errors when tags are dropped
## (presumably left commented out so the example does not stop on the error)
# lost_tags_action("error")
# x[, 1]

## reset default behaviour
## (calling lost_tags_action() with no argument switches back to warnings)
lost_tags_action()
#> Lost tags will now issue a warning.

# using tidyverse style

## example of creating a safeframe, adding a new variable, and adding a tag
## for it

## require() returns FALSE instead of erroring when a package is missing, so
## the whole block is skipped if dplyr or magrittr is unavailable
if (require(dplyr) && require(magrittr)) {
  ## NOTE(review): every speed in `cars` shown above is between 4 and 25, so
  ## `speed > 50` is never TRUE and `result` is "slow" for all rows
  x <- cars %>%
    tibble() %>%
    make_safeframe(
      mph = "speed",
      distance = "dist"
    ) %>%
    mutate(result = if_else(speed > 50, "fast", "slow")) %>%
    set_tags(ticket = "result")

  ## NOTE(review): expressions inside `if { }` are not auto-printed; only the
  ## value of the last expression in the block is shown below. Wrap this call
  ## and the next two in print() if their results should be displayed
  head(x)

  ## extract tagged variables
  ## (keeps only the `ticket` column, so the `mph` and `distance` tags are
  ## reported as lost in the first warning)
  x %>%
    select(has_tag(c("ticket")))

  ## Retrieve all tags
  x %>%
    tags()

  ## Select based on variable name
  ## (keeps only `speed`, so the `distance` and `ticket` tags are reported as
  ## lost in the second warning; this is the only result that gets printed)
  x %>%
    select(starts_with("speed"))
}
#> Warning: The following tagged variables are lost:
#>  speed - mph
#>  dist - distance
#> Warning: The following tagged variables are lost:
#>  dist - distance
#>  result - ticket
#> 
#> // safeframe object
#> # A tibble: 50 × 1
#>    speed
#>    <dbl>
#>  1     4
#>  2     4
#>  3     7
#>  4     7
#>  5     8
#>  6     9
#>  7    10
#>  8    10
#>  9    10
#> 10    11
#> # ℹ 40 more rows
#> 
#> tagged variables:
#>  mph - speed