# NOT RUN {
# When no spans are present, htmltab produces output close to XML's readHTMLTable(),
# but it removes many types of non-data elements (footnotes, non-visible HTML elements, etc.)
# Address of the page to scrape, and an XPath that selects the table whose
# caption text starts with 'World historical'.
url <- "http://en.wikipedia.org/wiki/World_population"
xp <- "//caption[starts-with(text(),'World historical')]/ancestor::table"
# Parse the selected table using htmltab's default settings.
htmltab(doc = url, which = xp)
# Cell handler passed to htmltab() as bodyFun: takes a node, extracts its
# text value and returns that text with every comma removed.
popFun <- function(node) {
  cell_text <- XML::xmlValue(node)
  gsub(",", "", cell_text, fixed = TRUE)
}
# Parse the same table again, this time passing popFun as bodyFun.
htmltab(doc = url, which = xp, bodyFun = popFun)
# This table lacks header information, so we provide column names through colNames.
# We also need to set header = 0 to indicate that no header is present.
doc <- "http://en.wikipedia.org/wiki/FC_Bayern_Munich"
xp2 <- "//td[text() = 'Head coach']/ancestor::table"
htmltab(doc = doc, which = xp2, header = 0, encoding = "UTF-8", colNames = c("name", "role"))
# htmltab recognizes column spans and produces a one-dimensional vector of variable information;
# it also automatically removes superscript information, since it is usually not of use.
doc <- "http://en.wikipedia.org/wiki/Usage_share_of_web_browsers"
# Positional XPath: picks a table by its index rather than by its content.
xp3 <- "//table[7]"
# Cell handler passed to htmltab() as bodyFun: takes a node, extracts its
# text value and drops a trailing percent sign, if there is one.
bFun <- function(node) {
  cell_text <- XML::xmlValue(node)
  # The pattern is anchored at the end of the string, so it matches at most once.
  sub("%$", "", cell_text)
}
# Parse the table selected by xp3, passing bFun as bodyFun.
htmltab(doc = doc, which = xp3, bodyFun = bFun)
# Here the table is selected by number (which = 3) and the header argument
# is given as 1:2.
htmltab("https://en.wikipedia.org/wiki/Arjen_Robben", which = 3,
header = 1:2)
# When header information appears throughout the body, you can specify its
# position in the header formula.
# NOTE(review): `url` still holds the World_population address assigned near
# the top of this script, while the XPath below looks for a table with
# id 'team_gamelogs' -- presumably a different page was intended; confirm and
# reassign `url` before running.
htmltab(url, which = "//table[@id='team_gamelogs']", header = . + "//td[./strong]")
# }
# Run the code above in your browser using DataLab