# NOT RUN {
## This changed to using https: in June 2015, and that is unsupported.
# u = "http://en.wikipedia.org/wiki/World_population"
# Read the population tables from Wikipedia.
u <- "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
# The parser cannot retrieve https URLs on its own, so download the page
# text with base R and parse that text instead. Parsing once also lets
# every step below share the same document rather than fetching it again.
doc <- htmlParse(paste(readLines(u, warn = FALSE), collapse = "\n"),
                 asText = TRUE)
tables <- readHTMLTable(doc)
names(tables)
# Print the table. Note that the values are all characters
# not numbers. Also the column names have a preceding X since
# R doesn't allow the variable names to start with digits.
tables[[2]]
tmp <- tables[[2]]
# Let's just read the second table directly by itself.
tableNodes <- getNodeSet(doc, "//table")
tb <- readHTMLTable(tableNodes[[2]])
# Let's try to adapt the values on the fly.
# We'll create a function that turns a th/td node into a val
# Convert the text of a table cell to an integer when that is lossless.
#
# node: a th/td node; its text is obtained with xmlValue().
#
# Returns an integer when the text, with commas (thousands separators)
# removed, is a whole number that fits in R's integer range. In every
# other case the original text is returned unchanged, so callers get
# either an integer or the untouched cell value.
tryAsInteger <- function(node) {
  val <- xmlValue(node)

  # An empty or missing value has nothing to convert, and testing a
  # zero-length result inside `if` would be an error.
  if (length(val) != 1L || is.na(val)) {
    return(val)
  }

  # Go through numeric first so that decimals and values beyond the
  # integer range can be detected instead of being truncated or turned
  # into NA. The coercion warning for ordinary text is expected here.
  num <- suppressWarnings(as.numeric(gsub(",", "", val)))

  if (is.na(num) || num != trunc(num) || abs(num) > .Machine$integer.max) {
    val
  } else {
    as.integer(num)
  }
}
# Re-read the second table, passing every cell through tryAsInteger
# as it is extracted.
tb <- readHTMLTable(tableNodes[[2]], elFun = tryAsInteger)
# Same call, this time also declaring the column classes: one character
# column followed by nine integer columns.
tb <- readHTMLTable(
  tableNodes[[2]],
  elFun = tryAsInteger,
  colClasses = rep(c("character", "integer"), times = c(1, 9))
)
# }
# NOT RUN {
# Read every table from the historical CPI page. The page is served over
# https, which the parser cannot retrieve itself, so the text is
# downloaded with base R and parsed from memory. local() keeps the
# intermediate values out of the workspace.
zz <- local({
  page_url <-
    "https://www.inflationdata.com/Inflation/Consumer_Price_Index/HistoricalCPI.aspx"
  page_text <- paste(readLines(page_url, warn = FALSE), collapse = "\n")
  readHTMLTable(htmlParse(page_text, asText = TRUE))
})
# Flag the tables that have 14 columns. vapply() guarantees a plain
# integer vector even when no tables were found.
i <- vapply(zz, function(x) if (is.null(x)) 0L else ncol(x), integer(1)) == 14
if (any(i)) {
  # guard against the structure of the page changing.
  zz <- zz[[which(i)[1]]] # 4th table
  # convert columns to numeric. Could use colClasses in the call to readHTMLTable()
  # The regular expression drops everything up to the last space in each
  # cell before converting.
  zz[-1] <- lapply(zz[-1], function(x) as.numeric(gsub(".* ", "", as.character(x))))
  # Plot columns 2 to 13 with one line per table row.
  matplot(1:12, t(zz[-c(1, 14)]), type = "l")
}
# From Marsh Feldman on R-help, possibly
# https://stat.ethz.ch/pipermail/r-help/2010-March/232586.html
# That site was non-responsive in June 2015,
# and this does not do a good job on the current table.
# }
# NOT RUN {
# Address of the NBER cycles page.
doc <- "http://www.nber.org/cycles/cyclesmain.html"
# The main table is the second one because it's embedded in the page table.
tables <- getNodeSet(htmlParse(doc), "//table")
# Supply the six column names explicitly and keep every column as
# trimmed character data.
xt <- readHTMLTable(
  tables[[2]],
  header = c(
    "peak", "trough", "contraction",
    "expansion", "trough2trough", "peak2peak"
  ),
  colClasses = rep("character", 6),
  trim = TRUE,
  stringsAsFactors = FALSE
)
# }
# NOT RUN {
if (FALSE) {
  # Here is a totally different way of reading tables from HTML documents.
  # The data are formatted using PRE and so can be read via read.table
  u <- "http://tidesonline.nos.noaa.gov/data_read.shtml?station_info=9414290+San+Francisco,+CA"
  h <- htmlParse(u)
  p <- getNodeSet(h, "//pre")
  # Expose the text of the second <pre> node as a connection so that
  # read.table() can consume it like a file.
  con <- textConnection(xmlValue(p[[2]]))
  tides <- read.table(con)
  # read.table() leaves a connection that was already open as it found
  # it, so release it explicitly.
  close(con)
}
# }
# NOT RUN {
## This is not accessible without authentication ...
u <- "http://www.omegahat.net/RCurl/testPassword/table.html"
# Only attempt the download when RCurl can be loaded and the URL
# responds.
if (require(RCurl) && url.exists(u)) {
  # Fetch the page text with the credentials, then read its tables from
  # that text.
  tt <- getURL(u, userpwd = "bob:duncantl")
  readHTMLTable(tt)
}
# }
# Run the code above in your browser using DataLab