# To understand the difference between html_text() and html_text2()
# take the following html. The paragraph contains a newline plus
# indentation inside the source text, and an explicit <br> line break:
html <- minimal_html(
  "<p>This is a paragraph.
    This another sentence.<br>This should start on a new line"
)

# html_text() returns the raw underlying text, which includes whitespace
# that would be ignored by a browser, and ignores the <br>
html %>% html_element("p") %>% html_text() %>% writeLines()

# html_text2() simulates what a browser would display. Non-significant
# whitespace is collapsed, and <br> is turned into a line break
html %>% html_element("p") %>% html_text2() %>% writeLines()
# By default, html_text2() also converts non-breaking spaces to regular
# spaces. The non-breaking space is written as the &nbsp; entity so that
# it is unambiguous in the source and cannot be confused with (or
# silently replaced by) an ordinary space:
html <- minimal_html("<p>x&nbsp;y</p>")
x1 <- html %>% html_element("p") %>% html_text()
x2 <- html %>% html_element("p") %>% html_text2()

# When printed, non-breaking spaces look exactly like regular spaces
x1
x2

# But aren't actually the same:
x1 == x2
# Which you can confirm by looking at their underlying binary
# representation:
charToRaw(x1)
charToRaw(x2)
# Run the code above in your browser using DataLab