if (FALSE) { ###dontrunbegin
  # Example session for the rmcfs package (MCFS-ID feature selection).
  # The whole session is wrapped in `if (FALSE) { ... }` so that sourcing this
  # file does not trigger the long-running computations or the data download
  # below; run the statements interactively to reproduce the examples.

  ####################################
  ######### Artificial data ##########
  ####################################

  # Set VM size for Java (done before the package is attached).
  options(java.parameters = "-Xmx8g")
  library(rmcfs)

  # Create input data and review it.
  adata <- artificial.data(rnd_features = 10)
  showme(adata)

  # Parametrize and run MCFS-ID procedure.
  result <- mcfs(
    class ~ ., adata,
    cutoffPermutations = 3,
    featureFreq = 50,
    buildID = TRUE,
    finalCV = FALSE,
    finalRuleset = FALSE,
    threadsNumber = 2
  )

  # Print basic information about mcfs result.
  print(result)

  # Review cutoff values for all methods.
  print(result$cutoff)

  # Review cutoff value used in plots.
  print(result$cutoff_value)

  # Plot & print out distances between subsequent projections.
  # These are convergence MCFS-ID statistics.
  plot(result, type = "distances")
  print(result$distances)

  # Plot & print out 50 most important features and show max RI values from
  # permutation experiment.
  plot(result, type = "ri", size = 50)
  print(head(result$RI, 50))

  # Plot & print out 50 strongest feature interdependencies.
  plot(result, type = "id", size = 50)
  print(head(result$ID, 50))

  # Plot features ordered by RI. Parameter 'size' is the number of
  # top features in the chart. By default it is set on cutoff_value + 10.
  plot(result, type = "features", cex = 1)

  # Here we set 'size' at fixed value 10.
  plot(result, type = "features", size = 10)

  # Plot cv classification result obtained on top features.
  # In the middle of x axis red label denotes cutoff_value.
  # NOTE(review): left disabled here, presumably because this run sets
  # finalCV = FALSE above - confirm before enabling.
  # plot(result, type = "cv", cv_measure = "wacc", cex = 0.8)

  # Plot & print out confusion matrix. This matrix is the result of
  # all classifications performed by all decision trees on all s*t datasets.
  plot(result, type = "cmatrix")

  # Build interdependencies graph (all default parameters).
  gid <- build.idgraph(result)
  plot(gid, label_dist = 1)

  # Build interdependencies graph for top 6 features
  # and top 12 interdependencies and plot all nodes.
  gid <- build.idgraph(result, size = 6, size_ID = 12, orphan_nodes = TRUE)
  plot(gid, label_dist = 1)

  # Export graph to graphML (XML structure).
  path <- tempdir()
  igraph::write_graph(
    gid,
    file = file.path(path, "artificial.graphml"),
    format = "graphml",
    prefixAttr = FALSE
  )

  # Export and import results to/from csv files.
  export.result(result, path = path, label = "artificial")
  result <- import.result(path = path, label = "artificial")

  # Find out how many trees with the given attribute have been built (and how
  # many nodes are based on the attribute in total). Notice that
  # result$RI$projections keeps the number of subsets where the feature was
  # randomly picked. The value:
  #   result$RI$projections * result$params$mcfs.splits
  # is the total number of trees that could be built based on the attribute.
  # This normalization takes into consideration not the full number of s*t
  # trees but only the fraction that is trained on datasets with the attribute.
  result$RI$classifiers * (result$RI$projections * result$params$mcfs.splits)
  result$RI$nodes * (result$RI$projections * result$params$mcfs.splits)

  ####################################
  ########## Alizadeh data ###########
  ####################################

  # Load Alizadeh dataset.
  # A 4026 x 62 gene expression data matrix of log-ratio values. The last
  # column contains the annotations of the 62 samples with respect to the
  # cancer types C, D, F.
  # The data are from the lymphoma/leukemia study of A. Alizadeh et al.,
  # Nature 403:503-511 (2000), http://llmpp.nih.gov/lymphoma/index.shtml
  alizadeh <- read.csv(
    file = "http://home.ipipan.waw.pl/m.draminski/files/data/alizadeh.csv",
    stringsAsFactors = FALSE
  )
  showme(alizadeh)

  # Fix data types and data values - replace characters such as "," " " "/"
  # etc. from values and column names and fix data types.
  # This function may help if mcfs has any problems with input data.
  alizadeh <- fix.data(alizadeh)

  # Run MCFS-ID procedure on default parameters.
  # For larger real data (thousands of features) default 'auto' settings are
  # the best. This example may take 10-20 minutes but this one is a real
  # dataset with 4026 features.
  # Set up more threads according to your CPU cores number.
  result <- mcfs(
    class ~ ., alizadeh,
    featureFreq = 100,
    cutoffPermutations = 10,
    threadsNumber = 8
  )

  # Print basic information about mcfs result.
  print(result)

  # Plot & print out distances between subsequent projections.
  plot(result, type = "distances")

  # Show RI values for top 500 features and max RI values from permutation
  # experiment.
  plot(result, type = "ri", size = 500)

  # Plot heatmap on top features, only numeric features are presented.
  plot(
    result,
    type = "heatmap",
    size = 20,
    heatmap_norm = "norm",
    heatmap_fun = "median"
  )

  # Plot cv classification result obtained on top features.
  # In the middle of x axis red label denotes cutoff_value.
  plot(result, type = "cv", cv_measure = "wacc", cex = 0.8)

  # Build interdependencies graph.
  gid <- build.idgraph(result, size = 20)
  # Use the plot() generic, as in the artificial-data section above.
  plot(gid, label_dist = 0.3)
} ###dontrunend
# Run the code above in your browser using DataLab