######### Crawl, index, and store all pages of a website using 4 cores and 4 parallel requests
#
Rcrawler(Website ="http://glofile.com/", no_cores = 4, no_conn = 4)
######### Crawl and index the website using 8 cores and 8 parallel requests, respecting
# robots.txt rules and using a Mozilla string as the user agent.
Rcrawler(Website = "http://www.example.com/", no_cores=8, no_conn=8, Obeyrobots = TRUE,
Useragent="Mozilla 3.11")
######### Crawl the website using the default configuration and scrape specific data from
# the website; in this case we want all posts (articles and titles) matching two XPath patterns.
# We know that all blog posts have dates in their URLs, like 2017/09/08, so to avoid
# collecting category or other pages we can tell the crawler that the desired pages' URLs
# look like 4-digit/2-digit/2-digit/ using a regular expression.
# Note that you can use the exclude-pattern parameter to exclude a node from being
# extracted, e.g., in the case that a desired node includes (is a parent of) an
# undesired "child" node (an article containing inner ads or a menu).
Rcrawler(Website = "http://www.glofile.com/", dataUrlfilter = "/[0-9]{4}/[0-9]{2}/",
ExtractXpathPat = c("//*/article","//*/h1"), PatternsNames = c("content","title"))
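# A sketch of that exclusion (assuming your Rcrawler version provides the
# ExcludeXpathPat parameter; the "ads" class below is a hypothetical example of an
# unwanted child node inside the article):
Rcrawler(Website = "http://www.glofile.com/", dataUrlfilter = "/[0-9]{4}/[0-9]{2}/",
ExtractXpathPat = c("//*/article","//*/h1"), PatternsNames = c("content","title"),
ExcludeXpathPat = c("//*[@class='ads']"))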
######### Crawl the website and collect pages having URLs matching this regular expression
# pattern (/[0-9]{4}/[0-9]{2}/). Collected pages will be stored in a local repository
# named "myrepo", and the crawler stops after reaching the third level of website depth.
Rcrawler(Website = "http://www.example.com/", no_cores = 4, no_conn = 4,
dataUrlfilter = "/[0-9]{4}/[0-9]{2}/", DIR = "./myrepo", MaxDepth=3)
######### Crawl the website and collect/scrape only web pages related to a topic
# Crawl the website and collect pages containing keyword1 or keyword2 or both.
# To crawl a website and collect/scrape only web pages related to a specific topic,
# like gathering posts related to Donald Trump from a news website, the Rcrawler function
# has two useful parameters: KeywordsFilter and KeywordsAccuracy.
#
# KeywordsFilter: a character vector; here you should provide the keywords/terms of the
# topic you are looking for. Rcrawler will calculate an accuracy score based on the matched
# keywords and their occurrences on the page, then it collects or scrapes only web pages with
# at least a score of 1%, which means at least one keyword is found once on the page.
# This parameter must be a vector with at least one keyword, like c("mykeyword").
#
# KeywordsAccuracy: an integer value between 0 and 100, used only in combination with the
# KeywordsFilter parameter to determine the minimum accuracy of web pages to be collected
# /scraped. You can use one or more search terms; the accuracy is calculated based on
# how many of the provided keywords are found on the page plus their occurrence rate.
# For example, if only one keyword is provided, c("keyword"), 50% means one occurrence of
# "keyword" on the page and 100% means five occurrences of "keyword" on the page.
Rcrawler(Website = "http://www.example.com/", KeywordsFilter = c("keyword1", "keyword2"))
# Crawl the website and collect web pages that have an accuracy score higher than 50%
# for matching keyword1 and keyword2.
Rcrawler(Website = "http://www.example.com/", KeywordsFilter = c("keyword1", "keyword2"),
KeywordsAccuracy = 50)
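# As a sketch, the keyword filter can be combined with the extraction patterns used
# elsewhere in these examples, so that only topical pages are scraped rather than merely
# collected:
Rcrawler(Website = "http://www.example.com/", KeywordsFilter = c("keyword1", "keyword2"),
KeywordsAccuracy = 50, ExtractXpathPat = c("//*/article","//*/h1"),
PatternsNames = c("content","title"))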
######### Crawl a website's search results
# When scraping web pages specific to a topic of interest, the methods above have some
# disadvantages, namely complexity and time consumption, as the whole website needs to be
# crawled and each page analyzed to find the desired pages.
# As a result, you may want to make use of the website's search box and then directly
# crawl only search result pages. To do so, you may use the \code{crawlUrlfilter} and
# \code{dataUrlfilter} arguments, or \code{crawlZoneCSSPat}/\code{CrawlZoneXPath} with
# \code{dataUrlfilter}.
#- \code{crawlUrlfilter}: which URLs should be crawled (followed).
#- \code{dataUrlfilter}: which URLs should be collected (HTML downloaded or data extracted).
#- \code{crawlZoneCSSPat} or \code{CrawlZoneXPath}: the page section where the links to be
# crawled are located.
# Example 1
# The command below will crawl all result pages, knowing that result pages look like:
# http://glofile.com/?s=sur
# http://glofile.com/page/2/?s=sur
# so they all have "s=sur" in common.
# Post pages should be crawled as well; post URLs look like
# http://glofile.com/2017/06/08/placements-quelles-solutions-pour-dper/
# http://glofile.com/2017/06/08/taux-nette-detente/
# which contain a date and match the regex "[0-9]{4}/[0-9]{2}/[0-9]{2}".
Rcrawler(Website = "http://glofile.com/?s=sur", no_cores = 4, no_conn = 4,
crawlUrlfilter = c("[0-9]{4}/[0-9]{2}/[0-9]d{2}","s=sur"))
# In addition, by using dataUrlfilter we specify that:
# 1- only post pages should be collected/scraped, not all crawled result pages
# 2- additional URLs should not be retrieved from post pages
# (like post URLs listed in 'related topic' or 'see more' sections)
Rcrawler(Website = "http://glofile.com/?s=sur", no_cores = 4, no_conn = 4,
crawlUrlfilter = c("[0-9]{4}/[0-9]{2}/[0-9]d{2}","s=sur"),
dataUrlfilter = "[0-9]{4}/[0-9]{2}/[0-9]{2}")
# Example 2
# Collect job pages from the Indeed search results for "data analyst"
Rcrawler(Website = "https://www.indeed.com/jobs?q=data+analyst&l=Tampa,+FL",
no_cores = 4 , no_conn = 4,
crawlUrlfilter = c("/rc/","start="), dataUrlfilter = "/rc/")
# To also include the related job posts linked from each collected post, remove
# dataUrlfilter, as sketched below.
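# A sketch of that variant (the same call as above, simply without dataUrlfilter):
Rcrawler(Website = "https://www.indeed.com/jobs?q=data+analyst&l=Tampa,+FL",
no_cores = 4, no_conn = 4,
crawlUrlfilter = c("/rc/","start="))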
# Example 3
# Another way to control the crawler's behaviour and avoid fetching unnecessary
# links is to indicate to the crawler the page zone of interest
# (a page section from which links should be grabbed and crawled).
# The following example is similar to the last one, except that this time we provide
# the XPath pattern of the search results section whose links should all be crawled.
Rcrawler(Website = "https://www.indeed.com/jobs?q=data+analyst&l=Tampa,+FL",
no_cores = 4 , no_conn = 4,MaxDepth = 3,
crawlZoneXPath = c("//*[@id='resultsCol']"), dataUrlfilter = "/rc/")
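# The same zone restriction can also be expressed with a CSS selector via crawlZoneCSSPat
# (a sketch; the selector is assumed to match the same results column):
Rcrawler(Website = "https://www.indeed.com/jobs?q=data+analyst&l=Tampa,+FL",
no_cores = 4, no_conn = 4, MaxDepth = 3,
crawlZoneCSSPat = c("#resultsCol"), dataUrlfilter = "/rc/")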
######### Crawl and scrape a forum's posts and replies; each page has a title and
# a list of replies, ExtractCSSPat = c("head>title","div[class=\"post\"]").
# All replies share the same pattern, therefore we set ManyPerPattern to TRUE
# to extract all of them.
Rcrawler(Website = "https://bitcointalk.org/", ManyPerPattern = TRUE,
ExtractCSSPat = c("head>title","div[class=\"post\"]"),
no_cores = 4, no_conn = 4, PatternsName = c("Title","Replies"))
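# After the crawl finishes, Rcrawler leaves its results in the global environment
# (an INDEX data frame of crawled URLs and, when extraction patterns are used, a DATA
# list of scraped values), so a quick inspection could look like:
head(INDEX)
str(DATA[[1]])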
######### Scrape data / collect pages meeting your custom criteria.
# This is useful when filtering by keywords or URLs does not fulfill your needs, for example
# if you want to detect target pages with a classification/prediction model, or simply by
# checking a specific text value/field in the web page; you can create a custom filter
# function for page selection as follows.
# First we create our function and test it on one page.
pageinfo<-LinkExtractor(url="http://glofile.com/index.php/2017/06/08/sondage-quel-budget/",
ExternalLInks = TRUE)
Customfilterfunc<-function(pageinfo){
decision<-FALSE
# put your conditions here, e.g. check whether the page source contains a given string
# (the keyword below is only an illustration)
if(grepl("sondage", pageinfo$Info$Source_page, ignore.case = TRUE)) decision<-TRUE
# then return a boolean value: TRUE = should be collected, FALSE = should be skipped
return(decision)
}
# Finally, you just pass it to the Rcrawler function through FUNPageFilter; the crawler
# will then evaluate each page using your set of rules.
Rcrawler(Website = "http://glofile.com", no_cores=2, FUNPageFilter= Customfilterfunc )
######### Website Network
# Crawl the entire website and create network edge data (NetwEdges) of internal links.
# Using igraph, for example, you can plot the network with the following commands.
Rcrawler(Website = "http://glofile.com/" , no_cores = 4, no_conn = 4, NetworkData = TRUE)
library(igraph)
network<-graph_from_data_frame(NetwEdges, directed = TRUE)
plot(network)
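# As a further sketch, the same edge list can be used to rank pages by incoming internal
# links with igraph's degree(); the numeric vertex names are expected to index into the
# NetwIndex vector of URLs.
indeg <- degree(network, mode = "in")
head(sort(indeg, decreasing = TRUE))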
# Crawl the entire website and create network edge data of internal and external links.
Rcrawler(Website = "http://glofile.com/" , no_cores = 4, no_conn = 4, NetworkData = TRUE,
NetwExtLinks = TRUE)
###### Crawl a website using a web driver (virtual browser)
###########################################################################
## In some cases you may need to retrieve content from a web page which
## requires authentication via a login page, like private forums or platforms.
## In this case you need to run the \link{LoginSession} function to establish an
## authenticated browser session, then use \link{LinkExtractor} to fetch
## the URL using the authenticated session.
## In the example below we will try to fetch a private blog post which
## requires authentication.
# If you retrieve the page using the regular LinkExtractor function (or your browser):
page<-LinkExtractor("http://glofile.com/index.php/2017/06/08/jcdecaux/")
# the post is not visible because it is private.
# Now we will try to log in to access this post using the following credentials:
# username: demo and password: rc@pass@r
#1 Download and install the phantomjs headless browser (skip if already installed)
install_browser()
#2 Start the browser process
br <-run_browser()
#3 Create an authenticated session
# see \link{LoginSession} for more details
LS<-LoginSession(Browser = br, LoginURL = 'http://glofile.com/wp-login.php',
LoginCredentials = c('demo','rc@pass@r'),
cssLoginFields =c('#user_login', '#user_pass'),
cssLoginButton='#wp-submit' )
# Check if the login was successful
LS$session$getTitle()
#Or
LS$session$getUrl()
#Or
LS$session$takeScreenshot(file = 'sc.png')
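# With the session authenticated, the private post can now be fetched by passing it to
# LinkExtractor (a sketch; the returned page object feeds ContentScraper as usual), and
# the headless browser started above can be released afterwards:
page<-LinkExtractor(url="http://glofile.com/index.php/2017/06/08/jcdecaux/",
LoggedSession = LS)
stop_browser(br)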
# A second example: log in to a Submittable account (credentials are placeholders)
LS<-run_browser()
LS<-LoginSession(Browser = LS, LoginURL = 'https://manager.submittable.com/login',
LoginCredentials = c('your email','your password'),
cssLoginFields =c('#email', '#password'),
XpathLoginButton ='//*[@type="submit"]' )
# page<-LinkExtractor(url='https://manager.submittable.com/beta/discover/119087',
# LoggedSession = LS)
# cont<-ContentScraper(HTmlText = page$Info$Source_page,
# XpathPatterns = c("//*[@id=\"submitter-app\"]/div/div[2]/div/div/div/div/div[3]",
# "//*[@id=\"submitter-app\"]/div/div[2]/div/div/div/div/div[2]/div[1]/div[1]" ),
# PatternsName = c("Article","Title"), astext = TRUE )