In this vignette I show comparisons between namedCapture::df_match_variable and its closest cousin in the R package universe, tidyr::extract. The two packages can be used to compute the same result, but the code/syntax is different.

Longer more readable syntax

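In this first comparison we use a syntax with each group name on the same line as its pattern. Here are some observations from the comparison: with namedCapture, a type conversion function can be written on the same line as the group it applies to, whereas tidyr relies on convert=TRUE, so the commas must be removed from the position column before extraction; with tidyr, the regular expression has to be assembled by hand from the named pieces, which takes more code (46 lines versus 29); and an optional group that does not match is reported as the empty string by namedCapture, whereas the tidyr result has its missing values replaced by the empty string before the two results are compared.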

## First define data.
## Two character columns: genomic ranges (position) and job
## identifiers (JobID). Some positions deliberately do not match.
sacct.df <- data.frame(
  position=c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # neither will this.
    "chr1:110-111 chr2:220-222"), # two possible matches.
  JobID=c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors=FALSE)
sacct.df
#>                        position              JobID
#> 1 chr10:213,054,000-213,055,000        13937810_25
#> 2          chrM:111,000-222,000  13937810_25.batch
#> 3           this will not match 13937810_25.extern
#> 4                          <NA>     14022192_[1-3]
#> 5     chr1:110-111 chr2:220-222       14022204_[4]
## Delete every comma from each element of x.
remove.commas <- function(x){
  gsub(",", "", x, fixed=TRUE)
}
## Results of the first comparison, one element per package.
long.list <- list()

## namedCapture: 29 lines of code.
## Each pattern is a list of regex fragments: unnamed strings are
## pieces of the regex, name="pattern" entries define named capture
## groups, and a function written after a named group (as.integer,
## to.int) converts the text captured by that group.
range.list <- list(
  "\\[",
  task1="[0-9]+", as.integer,
  "(?:-",#begin optional end of range.
  taskN="[0-9]+", as.integer,
  ")?", #end is optional.
  "\\]")
task.list <- list(
  "(?:",#begin alternate
  task="[0-9]+", as.integer,
  "|",#either one task(above) or range(below)
  range.list,
  ")")#end alternate
to.int <- function(x)as.integer(remove.commas(x)) # strip commas, then convert to integer.
(long.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID=list( # pattern applied to the JobID column.
    job="[0-9]+", as.integer,
    "_",
    task.list,
    "(?:[.]", # begin optional ".type" suffix.
    type=".*",
    ")?"),
  position=list( # pattern applied to the position column.
    chrom="chr.*?",
    ":",
    chromStart=".*?", to.int,
    "-",
    chromEnd="[0-9,]*", to.int)))
#>                        position              JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000        13937810_25  13937810         25
#> 2          chrM:111,000-222,000  13937810_25.batch  13937810         25
#> 3           this will not match 13937810_25.extern  13937810         25
#> 4                          <NA>     14022192_[1-3]  14022192         NA
#> 5     chr1:110-111 chr2:220-222       14022204_[4]  14022204         NA
#>   JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1          NA          NA                     chr10           213054000
#> 2          NA          NA      batch           chrM              111000
#> 3          NA          NA     extern           <NA>                  NA
#> 4           1           3                      <NA>                  NA
#> 5           4          NA                      chr1                 110
#>   position.chromEnd
#> 1         213055000
#> 2            222000
#> 3                NA
#> 4                NA
#> 5               111

## tidyr: 46 lines of code.
## The same regex fragments are stored in character vectors whose
## named elements are the capture groups. The loop below builds one
## regex per column and calls tidyr::extract once per column.
range.vec <- c(
  "\\[",
  task1="[0-9]+", 
  "(?:-",#begin optional end of range.
  taskN="[0-9]+", 
  ")?", #end is optional.
  "\\]")
task.vec <- c(
  "(?:",#begin alternate
  task="[0-9]+", 
  "|",#either one task(above) or range(below)
  range.vec,
  ")")#end alternate
regex.list <- list( # one named character vector per column to match.
  JobID=c(
    job="[0-9]+", 
    "_",
    task.vec,
    "(?:[.]",
    type=".*",
    ")?"),
  position=c(
    chrom="chr.*?",
    ":",
    chromStart=".*?",
    "-",
    chromEnd="[0-9,]*"))
tidyr.input <- transform(
  sacct.df,
  position=remove.commas(position)) # commas are removed before extraction.
tidyr.df.list <- list(sacct.df) # first element: the original columns.
for(col.name in names(regex.list)){
  regex.vec <- regex.list[[col.name]]
  is.group <- names(regex.vec)!="" # named elements are capture groups.
  format.vec <- ifelse(is.group, "(%s)", "%s") # parenthesize only the groups.
  group.vec <- sprintf(format.vec, regex.vec)
  regex <- paste(group.vec, collapse="") # the full regex for this column.
  group.names <- names(regex.vec)[is.group]
  result <- tidyr::extract(
    tidyr.input, col.name, group.names, regex, convert=TRUE)
  to.save <- result[, group.names, drop=FALSE] # keep only the new group columns.
  names(to.save) <- paste0(col.name, ".", group.names) # e.g. JobID.job
  tidyr.df.list[[col.name]] <- to.save
}
names(tidyr.df.list) <- NULL # presumably so cbind does not prefix the column names with the list names -- TODO confirm.
long.list$tidyr <- do.call(cbind, tidyr.df.list)

## Make sure the results are the same.
## Column names, one row per package.
do.call(rbind, lapply(long.list, names))
#>              [,1]       [,2]    [,3]        [,4]         [,5]         
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> tidyr        "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#>              [,6]          [,7]         [,8]            
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom"
#> tidyr        "JobID.taskN" "JobID.type" "position.chrom"
#>              [,9]                  [,10]              
#> namedCapture "position.chromStart" "position.chromEnd"
#> tidyr        "position.chromStart" "position.chromEnd"
## Column classes, one row per package.
do.call(rbind, lapply(long.list, sapply, class))
#>              position    JobID       JobID.job JobID.task JobID.task1
#> namedCapture "character" "character" "integer" "integer"  "integer"  
#> tidyr        "character" "character" "integer" "integer"  "integer"  
#>              JobID.taskN JobID.type  position.chrom position.chromStart
#> namedCapture "integer"   "character" "character"    "integer"          
#> tidyr        "integer"   "character" "character"    "integer"          
#>              position.chromEnd
#> namedCapture "integer"        
#> tidyr        "integer"
## Replace any missing JobID.type in the tidyr result by the empty
## string before testing the two data frames for exact equality.
long.list$tidyr$JobID.type[is.na(long.list$tidyr$JobID.type)] <- ""
identical(long.list$tidyr, long.list$namedCapture)
#> [1] TRUE

Shorter less readable syntax

This second comparison uses a syntax with the entire regex on one line. In my opinion this syntax makes the regular expressions more difficult to read/understand. Complicated regular expressions like the one used for matching the JobID column are not maintainable/understandable at all using this syntax.

## First define data.
## Two character columns: genomic ranges (position) and job
## identifiers (JobID). Some positions deliberately do not match.
sacct.df <- data.frame(
  position=c(
    "chr10:213,054,000-213,055,000",
    "chrM:111,000-222,000",
    "this will not match",
    NA, # neither will this.
    "chr1:110-111 chr2:220-222"), # two possible matches.
  JobID=c(
    "13937810_25",
    "13937810_25.batch",
    "13937810_25.extern",
    "14022192_[1-3]",
    "14022204_[4]"),
  stringsAsFactors=FALSE)
sacct.df
#>                        position              JobID
#> 1 chr10:213,054,000-213,055,000        13937810_25
#> 2          chrM:111,000-222,000  13937810_25.batch
#> 3           this will not match 13937810_25.extern
#> 4                          <NA>     14022192_[1-3]
#> 5     chr1:110-111 chr2:220-222       14022204_[4]
## Results of the second comparison, one element per package.
short.list <- list()

## tidyr alternate (13 lines total)
## e() extracts the groups of one column of sacct.df with a
## single-string regex and returns only the new columns, renamed to
## "<col.name>.<group name>".
##   col.name: name of the column to match.
##   group.names: names for the capture groups, in regex order.
##   pattern: regex with one capture group per element of group.names.
e <- function(col.name, group.names, pattern){
  result <- tidyr::extract(
    sacct.df, col.name, group.names, pattern, convert=TRUE) # reads the global sacct.df; commas are not removed here.
  to.save <- result[, group.names, drop=FALSE] # keep only the new group columns.
  names(to.save) <- paste0(col.name, ".", group.names)
  to.save
}
short.list$tidyr <- do.call(cbind, list( # original columns, then the extracted groups.
  sacct.df,
  e("JobID", c("job", "task", "task1", "taskN", "type"),
    "([0-9]+)_(?:([0-9]+)|\\[([0-9]+)(?:-([0-9]+))?\\])(?:[.](.*))?"),
  e("position", c("chrom", "chromStart", "chromEnd"),
    "(chr.*?):(.*?)-([0-9,]*)")))

## namedCapture alternate (7 lines total)
## Each pattern is a single string that names its groups with
## (?P<name>...). No conversion functions are given here, so the
## captured text is returned as is (see the printed result below).
(short.list$namedCapture <- namedCapture::df_match_variable(
  sacct.df,
  JobID="(?P<job>[0-9]+)_(?:(?P<task>[0-9]+)|\\[(?P<task1>[0-9]+)(?:-(?P<taskN>[0-9]+))?\\])(?:[.](?P<type>.*))?",
  position="(?P<chrom>chr.*?):(?P<chromStart>.*?)-(?P<chromEnd>[0-9,]*)"))
#>                        position              JobID JobID.job JobID.task
#> 1 chr10:213,054,000-213,055,000        13937810_25  13937810         25
#> 2          chrM:111,000-222,000  13937810_25.batch  13937810         25
#> 3           this will not match 13937810_25.extern  13937810         25
#> 4                          <NA>     14022192_[1-3]  14022192           
#> 5     chr1:110-111 chr2:220-222       14022204_[4]  14022204           
#>   JobID.task1 JobID.taskN JobID.type position.chrom position.chromStart
#> 1                                             chr10         213,054,000
#> 2                              batch           chrM             111,000
#> 3                             extern           <NA>                <NA>
#> 4           1           3                      <NA>                <NA>
#> 5           4                                  chr1                 110
#>   position.chromEnd
#> 1       213,055,000
#> 2           222,000
#> 3              <NA>
#> 4              <NA>
#> 5               111
for(N in names(short.list$namedCapture)){ # convert every column in place, one at a time.
  short.list$namedCapture[[N]] <- type.convert(short.list$namedCapture[[N]], as.is=TRUE) # as.is=TRUE: strings stay character instead of becoming factors.
}

## Make sure the results are the same.
## Column names, one row per package.
do.call(rbind, lapply(short.list, names))
#>              [,1]       [,2]    [,3]        [,4]         [,5]         
#> tidyr        "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#> namedCapture "position" "JobID" "JobID.job" "JobID.task" "JobID.task1"
#>              [,6]          [,7]         [,8]            
#> tidyr        "JobID.taskN" "JobID.type" "position.chrom"
#> namedCapture "JobID.taskN" "JobID.type" "position.chrom"
#>              [,9]                  [,10]              
#> tidyr        "position.chromStart" "position.chromEnd"
#> namedCapture "position.chromStart" "position.chromEnd"
## Column classes, one row per package.
do.call(rbind, lapply(short.list, sapply, class))
#>              position    JobID       JobID.job JobID.task JobID.task1
#> tidyr        "character" "character" "integer" "integer"  "integer"  
#> namedCapture "character" "character" "integer" "integer"  "integer"  
#>              JobID.taskN JobID.type  position.chrom position.chromStart
#> tidyr        "integer"   "character" "character"    "character"        
#> namedCapture "integer"   "character" "character"    "character"        
#>              position.chromEnd
#> tidyr        "character"      
#> namedCapture "character"
## Replace any missing JobID.type in the tidyr result by the empty
## string before testing the two data frames for exact equality.
short.list$tidyr$JobID.type[is.na(short.list$tidyr$JobID.type)] <- ""
identical(short.list$tidyr, short.list$namedCapture)
#> [1] TRUE