# NOT RUN {
## Example subject vector: two well-formed positions, two entries
## that cannot match, and one element holding two positions.
chr.pos.vec <- c(
  "chr10:213,054,000-213,055,000",
  "chrM:111,000-222,000",
  "this will not match",
  NA_character_, # a missing value does not match either.
  "chr1:110-111 chr2:220-222" # two possible matches in one element.
)
## Drop every character that is not a digit (for example the comma
## thousands separators) and convert what remains to integer.
keep.digits <- function(x) {
  digits.only <- gsub("[^0-9]", "", x)
  as.integer(digits.only)
}
## With the defaults, each element of the subject is treated as a
## separate line and NA elements are removed. Every named argument
## creates a capture group; a conversion function (here keep.digits)
## listed after a pattern converts the text captured by the group
## named just before it.
int.pattern <- list("[0-9,]+", keep.digits)
(match.dt <- nc::capture_all_str(
  chr.pos.vec,
  chrom = "chr.*?",
  ":",
  chromStart = int.pattern,
  "-",
  chromEnd = int.pattern
))
## Inspect the structure (column types) of the result.
str(match.dt)
## Unicode character classes need engine="ICU", see
## http://userguide.icu-project.org/strings/regexp
## For example, match any character whose numeric value is 2
## (including Japanese etc).
nc::capture_all_str(
  "\u4e8c \u4e09 2 3 ",
  two = "[\\p{numeric_value=2}]",
  engine = "ICU"
)
## Pull every field out of each alignment block with two regex
## passes, then reshape the result to wide format with dcast.
library(data.table)
info.txt.gz <- system.file(
  "extdata", "SweeD_Info.txt.gz", package = "nc")
info.vec <- readLines(info.txt.gz)
info.vec[24:40]
## Pass 1: rewrite the "Alignment " marker as "//" so that one
## pattern captures the alignment number and the text that follows
## it (everything up to the next "/").
info.dt <- nc::capture_all_str(
  sub("Alignment ", "//", info.vec),
  "//",
  alignment = "[0-9]+",
  fields = "[^/]+"
)
## Pass 2: within each alignment, capture all variable/value pairs.
(fields.dt <- info.dt[
  ,
  nc::capture_all_str(
    fields,
    "\t+",
    variable = "[^:]+",
    ":\t*",
    value = ".*"
  ),
  by = alignment
])
## One row per alignment, one column per variable.
(fields.wide <- dcast(
  fields.dt,
  alignment ~ variable,
  value.var = "value"
))
## Capture every csv table in the report: each "//" line gives the
## alignment number, and the text after the newline (up to the next
## "/") is read as a table with fread, one table per alignment.
report.txt.gz <- system.file(
  "extdata", "SweeD_Report.txt.gz", package = "nc")
report.vec <- readLines(report.txt.gz)
(report.dt <- nc::capture_all_str(
  report.vec,
  "//",
  alignment = "[0-9]+",
  "\n",
  csv = "[^/]+"
)[, fread(text = csv), by = alignment])
## Join the report rows with the wide info fields by alignment.
report.dt[fields.wide, on = "alignment"]
# }
Run the code above in your browser using DataLab