# }
######### Crawl, index, and store all pages of a website using 4 cores and 4 parallel requests
Rcrawler(
  Website = "http://glofile.com/",
  no_cores = 4, # number of cores
  no_conn = 4   # number of parallel requests
)
######### Crawl and index the website using 8 cores and 8 parallel requests while respecting
# robots.txt rules, using a Mozilla string as the user agent.
Rcrawler(
  Website = "http://www.example.com/",
  no_cores = 8,
  no_conn = 8,
  Obeyrobots = TRUE,
  Useragent = "Mozilla 3.11"
)
######### Crawl the website using the default configuration and scrape specific data from
# the website; in this case we need all posts (articles and titles) matching two XPath patterns.
# We know that all blog posts have dates in their URLs, like 2017/09/08, so to avoid
# collecting category or other pages we can tell the crawler that the desired pages' URLs
# contain 4-digit/2-digit/ (year/month) using a regular expression.
# Note that you can use the excludepattern parameter to exclude a node from being
# extracted, e.g., in the case that a desired node includes (is a parent of) an
# undesired "child" node (an article having inner ads or a menu).
Rcrawler(
  Website = "http://www.glofile.com/",
  dataUrlfilter = "/[0-9]{4}/[0-9]{2}/",
  # One name per extraction pattern, in the same order.
  ExtractXpathPat = c("//*/article", "//*/h1"),
  PatternsNames = c("content", "title")
)
######### Crawl the website and collect pages having URLs matching this regular expression
# pattern (/[0-9]{4}/[0-9]{2}/). Collected pages will be stored in a local repository
# named "myrepo". The crawler stops after reaching the third level of website depth.
Rcrawler(
  Website = "http://www.example.com/",
  no_cores = 4,
  no_conn = 4,
  dataUrlfilter = "/[0-9]{4}/[0-9]{2}/",
  DIR = "./myrepo",
  MaxDepth = 3
)
######### Crawl the website and collect/scrape only webpage related to a topic
# Crawl the website and collect pages containing keyword1 or keyword2 or both.
# To crawl a website and collect/scrape only some web pages related to a specific topic,
# like gathering posts related to Donald Trump from a news website. The Rcrawler function
# has two useful parameters KeywordsFilter and KeywordsAccuracy.
# KeywordsFilter : a character vector, here you should provide keywords/terms of the topic
# you are looking for. Rcrawler will calculate an accuracy score based on matched keywords
# and their occurrence on the page, then it collects or scrapes only web pages with at
# least a score of 1%, which means at least one keyword is found at least once on the page.
# This parameter must be a vector with at least one keyword like c("mykeyword").
# KeywordsAccuracy: Integer value range between 0 and 100, used only in combination with
# KeywordsFilter parameter to determine the minimum accuracy of web pages to be collected
# /scraped. You can use one or more search terms; the accuracy will be calculated based on
# how many provided keywords are found on the page plus their occurrence rate.
# For example, if only one keyword is provided c("keyword"), 50% means one occurrence of
# "keyword" in the page, and 100% means five occurrences of "keyword" in the page.
Rcrawler(
  Website = "http://www.example.com/",
  KeywordsFilter = c("keyword1", "keyword2")
)
# Crawl the website and collect web pages that have an accuracy percentage higher than 50%
# of matching keyword1 and keyword2.
Rcrawler(
  Website = "http://www.example.com/",
  KeywordsFilter = c("keyword1", "keyword2"),
  KeywordsAccuracy = 50
)
######### Crawl a website search results
# When scraping web pages specific to a topic of interest, the methods above have
# some disadvantages: they are complex and time-consuming, because the whole
# website needs to be crawled and each page analyzed to find the desired pages.
# As a result you may want to make use of the search box of the website and then directly
# crawl only the search result pages. To do so, you may use the \code{crawlUrlfilter} and
# \code{dataUrlfilter} arguments, or \code{crawlZoneCSSPat}/\code{crawlZoneXPath}:
#- \code{crawlUrlfilter}: what urls should be crawled (followed).
#- \code{dataUrlfilter}: what urls should be collected (download HTML or extract data).
#- \code{crawlZoneCSSPat} or \code{crawlZoneXPath}: the page section where links to be
#  crawled are located.
# Example1
# the command below will crawl all result pages knowing that result pages are like :
# so they all have "s=sur" in common
# Post pages should be crawled also, post urls are like
# which contain a date matching the regex "[0-9]{4}/[0-9]{2}/[0-9]{2}"
# Follow two kinds of URLs: dated post URLs (year/month/day) and search-result
# pages containing "s=sur". The date pattern previously read "[0-9]d{2}" for
# the day part, which requires a literal "dd" and so never matches a real date.
Rcrawler(
  Website = "http://glofile.com/?s=sur",
  no_cores = 4,
  no_conn = 4,
  crawlUrlfilter = c("[0-9]{4}/[0-9]{2}/[0-9]{2}", "s=sur")
)
# In addition by using dataUrlfilter we specify that :
# 1- only post pages should be collected/scraped not all crawled result pages
# 2- additional urls should not be retrieved from post pages
# (like post urls listed in 'related topic' or 'see more' sections)
# crawlUrlfilter: URLs to follow (dated post URLs and "s=sur" result pages).
# dataUrlfilter: URLs to collect (dated post URLs only).
# The day part of the crawl pattern previously read "[0-9]d{2}", which
# requires a literal "dd"; it now uses the same date regex as dataUrlfilter.
Rcrawler(
  Website = "http://glofile.com/?s=sur",
  no_cores = 4,
  no_conn = 4,
  crawlUrlfilter = c("[0-9]{4}/[0-9]{2}/[0-9]{2}", "s=sur"),
  dataUrlfilter = "[0-9]{4}/[0-9]{2}/[0-9]{2}"
)
# Example 2
# collect job pages from indeed search result of "data analyst"
Rcrawler(
  Website = "https://www.indeed.com/jobs?q=data+analyst&l=Tampa,+FL",
  no_cores = 4,
  no_conn = 4,
  crawlUrlfilter = c("/rc/", "start="),
  dataUrlfilter = "/rc/"
)
# To include related post jobs on each collected post remove dataUrlfilter
# Example 3
# Another way to control the crawler behaviour, and to avoid fetching
# unnecessary links, is to indicate to the crawler the page zone of interest
# (a page section from where links should be grabbed and crawled).
# The following example is similar to the last one, except this time we provide
# the XPath pattern of the search results section to be crawled with all links within.
# Restrict link discovery to the element with id 'resultsCol' and collect
# only URLs containing "/rc/".
# "\@" is an Rd-source escape, not a valid R string escape ("@" needs no
# escaping in R), so the XPath is written with a plain "@".
Rcrawler(
  Website = "https://www.indeed.com/jobs?q=data+analyst&l=Tampa,+FL",
  no_cores = 4,
  no_conn = 4,
  MaxDepth = 3,
  crawlZoneXPath = c("//*[@id='resultsCol']"),
  dataUrlfilter = "/rc/"
)
######### Crawl and scrape forum posts and replies; each page has a title and
# a list of replies, ExtractCSSPat = c("head>title","div[class=\"post\"]").
# All replies have the same pattern, therefore we set ManyPerPattern = TRUE
# to extract all of them.
# Extract the page title and every matching post on each page.
# The argument is spelled out in full as PatternsNames (rather than relying on
# partial matching of "PatternsName"); names are given in the same order as
# the CSS patterns.
Rcrawler(
  Website = "https://bitcointalk.org/",
  ManyPerPattern = TRUE,
  ExtractCSSPat = c("head>title", "div[class=\"post\"]"),
  no_cores = 4,
  no_conn = 4,
  PatternsNames = c("Title", "Replays")
)
######### Scrape data/collect pages meeting your custom criteria.
# This is useful when filtering by keywords or urls does not fulfil your needs, for example
# if you want to detect target pages with a classification/prediction model, or simply by checking
# a specific text value/field in the web page, you can create a custom filter function for
# page selection as follows.
# First we will create our function and test it with one page.
encod=encod, ExternalLInks = TRUE)
# put your conditions here
if(pageinfo$Info$Source_page ... ) ....
# then return a boolean value TRUE : should be collected / FALSE should be escaped
return TRUE or FALSE
# Finally, you just call it inside the Rcrawler function; the crawler will then evaluate each
# page using your set of rules.
Rcrawler(
  Website = "http://glofile.com",
  no_cores = 2,
  FUNPageFilter = Customfilterfunc
)
######### Website Network
# Crawl the entire website, and create network edges DATA of internal links.
# Using igraph, for example, you can plot the network with the following commands
Rcrawler(
  Website = "http://glofile.com/",
  no_cores = 4,
  no_conn = 4,
  NetworkData = TRUE
)
# Build a directed graph from the edge list.
# NOTE(review): NetwEdges is presumably created by the crawl above when
# NetworkData = TRUE, and graph.data.frame() comes from igraph, which must be
# attached first — confirm both.
# TRUE is used instead of T because T is an ordinary, reassignable variable.
network <- graph.data.frame(NetwEdges, directed = TRUE)
# Crawl the entire website, and create network edges DATA of internal and external links .
Rcrawler(
  Website = "http://glofile.com/",
  no_cores = 4,
  no_conn = 4,
  NetworkData = TRUE,
  NetwExtLinks = TRUE # also record external links as edges
)
###### Crawl a website using a web driver (virtual browser)
## In some cases you may need to retrieve content from a web page which
## requires authentication via a login page, like private forums, platforms..
## In this case you need to run the \link{LoginSession} function to establish an
## authenticated browser session; then use \link{LinkExtractor} to fetch
## the URL using the authenticated session.
## In the example below we will try to fetch a private blog post which
## requires authentication.
## If you retrieve the page using the regular function LinkExtractor or your browser,
## the post is not visible because it's private.
## Now we will try to log in to access this post using the following credentials:
## username : demo and password : rc@pass@r
#1 Download and install phantomjs headless browser (skip if installed)
#2 start browser process
br <- run_browser()
#3 create authenticated session
# see \link{LoginSession} for more details
# Log in through the WordPress login form using the browser started above.
# The CSS selectors for the username/password inputs are passed as
# cssLoginFields, in the same order as LoginCredentials (the previous
# "cssLoginCredentials" spelling does not match the argument name used by the
# other LoginSession example).
LS <- LoginSession(
  Browser = br,
  LoginURL = "http://glofile.com/wp-login.php",
  LoginCredentials = c("demo", "rc@pass@r"),
  cssLoginFields = c("#user_login", "#user_pass"),
  cssLoginButton = "#wp-submit"
)
#check if login successful
# Save a screenshot of the current browser page to sc.png for inspection.
LS$session$takeScreenshot(file = "sc.png")
# Second login example: fields located by CSS selector, submit button by XPath.
# "\@" is an Rd-source escape and not a valid R string escape; inside a
# single-quoted R string neither "@" nor '"' needs escaping.
# NOTE(review): Browser receives the previous LoginSession result (LS) rather
# than a fresh run_browser() object — confirm this is intended.
LS <- LoginSession(
  Browser = LS,
  LoginURL = "https://manager.submittable.com/login",
  LoginCredentials = c("your email", "your password"),
  cssLoginFields = c("#email", "#password"),
  XpathLoginButton = '//*[@type="submit"]'
)
# page<-LinkExtractor(url='https://manager.submittable.com/beta/discover/119087',
# LoggedSession = LS)
# cont<-ContentScraper(HTmlText = page$Info$Source_page,
# XpathPatterns = c("//*[@id=\"submitter-app\"]/div/div[2]/div/div/div/div/div[3]",
# "//*[@id=\"submitter-app\"]/div/div[2]/div/div/div/div/div[2]/div[1]/div[1]" ),
# PatternsName = c("Article","Title"),astext = TRUE )
# }
# }
# Run the code above in your browser using DataLab