# NOT RUN {
#### Pull the title, publishing date and article out of a single web page,
#### addressing each piece with a CSS selector.
DATA <- ContentScraper(
  Url = "http://glofile.com/index.php/2017/06/08/taux-nette-detente/",
  CssPatterns = c(".entry-title", ".published", ".entry-content"),
  astext = TRUE
)
#### The page source may also be supplied directly as HTML text
#### (a character string) instead of a URL.
txthml <- "<html><title>blah</title><div><p>I m the content</p></div></html>"
DATA <- ContentScraper(HTmlText = txthml, XpathPatterns = "//*/p")
#### Extract the post title and body from a web page using XPath patterns.
# PatternsName can be supplied to label each extracted element.
DATA <- ContentScraper(
  Url = "http://glofile.com/index.php/2017/06/08/athletisme-m-a-rome/",
  XpathPatterns = c("//head/title", "//*/article"),
  PatternsName = c("title", "article")
)
#### Extract the title and content of each of the 4 URLs below using CSS
# selectors. With two patterns per page, the DATA variable ends up holding
# 8 extracted elements (one title and one content per URL).
urllist <- c(
  "http://glofile.com/index.php/2017/06/08/sondage-quel-budget/",
  "http://glofile.com/index.php/2017/06/08/cyril-hanouna-tire-a-boulets-rouges-sur-le-csa/",
  "http://glofile.com/index.php/2017/06/08/placements-quelles-solutions-pour-doper/",
  "http://glofile.com/index.php/2017/06/08/paris-un-concentre-de-suspens/"
)
DATA <- ContentScraper(
  Url = urllist,
  CssPatterns = c(".entry-title", ".entry-content"),
  PatternsName = c("title", "content")
)
#### Extract the post title and the list of comments from a set of blog pages.
# The ManyPerPattern argument enables extracting many elements that share the
# same pattern from each page, such as comments, reviews, quotes and listings.
DATA <- ContentScraper(
  Url = urllist,
  CssPatterns = c(".entry-title", ".comment-content p"),
  PatternsName = c("title", "comments"),
  astext = TRUE,
  ManyPerPattern = TRUE
)
#### From this forum page, extract the post title and all replies using the
# CSS selectors c("head > title", ".post"). Each reply contains the previous
# replies as quotes, so the inner quotes are removed from each reply with
# ExcludeCSSPat = c(".quote", ".quoteheader").
DATA <- ContentScraper(
  Url = "https://bitcointalk.org/index.php?topic=2334331.0",
  CssPatterns = c("head > title", ".post"),
  ExcludeCSSPat = c(".quote", ".quoteheader"),
  PatternsName = c("Title", "Replys"),
  ManyPerPattern = TRUE
)
#### Scrape data from a web page requiring authentication
# Replace the placeholder credentials below with real ones before running.
# Create a logged-in session.
LS <- run_browser()
LS <- LoginSession(
  Browser = LS,
  LoginURL = "https://manager.submittable.com/login",
  LoginCredentials = c("your email", "your password"),
  cssLoginFields = c("#email", "#password"),
  XpathLoginButton = '//*[@type="submit"]'
)
# Then scrape data with the session.
DATA <- ContentScraper(
  Url = "https://manager.submittable.com/beta/discover/119087",
  XpathPatterns = c(
    '//*[@id="submitter-app"]/div/div[2]/div/div/div/div/div[3]',
    '//*[@id="submitter-app"]/div/div[2]/div/div/div/div/div[2]/div[1]/div[1]'
  ),
  PatternsName = c("Article", "Title"),
  astext = TRUE,
  browser = LS
)
# OR: fetch the page through the session with LinkExtractor, then hand its
# source to ContentScraper as HTML text.
page <- LinkExtractor(
  url = "https://manager.submittable.com/beta/discover/119087",
  browser = LS
)
DATA <- ContentScraper(
  HTmlText = page$Info$Source_page,
  XpathPatterns = c(
    '//*[@id="submitter-app"]/div/div[2]/div/div/div/div/div[3]',
    '//*[@id="submitter-app"]/div/div[2]/div/div/div/div/div[2]/div[1]/div[1]'
  ),
  PatternsName = c("Article", "Title"),
  astext = TRUE
)
# To get all first elements of the lists in one vector (example: all titles):
VecTitle <- unlist(lapply(DATA, `[[`, 1))
# To get all second elements of the lists in one vector (example: all articles):
VecContent <- unlist(lapply(DATA, `[[`, 2))
# }
# Run the code above in your browser using DataLab