Let’s play with some known datasets.
Spiral dataset
Import data from this file and extract a random subset of 15000 points
using DelimitedFiles, Random;
using AlgebraOfGraphics, GLMakie;
import GeometricDatasets as gd;
using ToMATo
# Load the spiral point cloud from the text file.
X = readdlm("datasets/spiral.txt");
# Fixed seed so the same subsample is drawn on every run.
seed = MersenneTwister(0);
# Draw 15_000 row indices; `rand` samples independently, so an index may repeat.
ids = rand(seed, 1:size(X, 1), 15_000)
# Keep the sampled rows and transpose, so that each column of X is one point.
X = Matrix(X[ids, :]');
X
2×15000 Matrix{Float64}:
937.984 1517.26 814.057 1377.31 … 1343.56 1703.04 478.709 1406.98
259.679 989.4 1721.36 242.761 485.03 928.448 611.633 1726.12
plot \(X\)
calculate the density
# Density estimate for the points in X.
# NOTE(review): `h` is presumably the kernel bandwidth — confirm in GeometricDatasets.
ds = gd.density_estimation(X; h = 100)
# Column table for AlgebraOfGraphics: both coordinates plus the density value.
df = (;
    x1 = X[1, :],
    x2 = X[2, :],
    ds = ds,
)
# Plot the points in the plane, coloured by density.
plt = data(df) * mapping(:x1, :x2; color = :ds)
draw(plt)
plot it in 3d
# Axis options passed to `draw`: a 3-D axis of fixed size.
axis = (; type = Axis3, width = 800, height = 450)
# Same table as for the 2-D plot: coordinates plus density.
df = (;
    x1 = X[1, :],
    x2 = X[2, :],
    ds = ds,
)
# The density is used both as the third coordinate and as the colour.
plt = data(df) * mapping(:x1, :x2, :ds; color = :ds)
draw(plt; axis = axis)
calculate the proximity graph \(g\)
# Build the proximity graph on the columns of X.
# NOTE(review): the neighbourhood rule presumably uses an ε-ball of radius 30 and
# falls back to k-nearest neighbours when the ball is too sparse — confirm in ToMATo.
g = proximity_graph(
    X,
    epsilon_ball_or_knn(
        30;
        min_ball_points = 10,
        max_ball_points = 25,
        knn_points = 10,
    ),
)
{15000, 150840} undirected simple Int64 graph
estimate \(\tau\)
# Run ToMATo with `Inf` in the position where the later call passes τ; only the
# second return value (births and deaths) is kept, the first is discarded.
_, births_and_deaths = tomato(X, g, ds, Inf)
# Plot the births and deaths; τ is chosen from this plot.
plot_births_and_deaths(births_and_deaths)
choose \(\tau = 0.01\), calculate the ToMATo clustering
# Threshold chosen from the births-and-deaths plot.
τ = 0.01
# Cluster with that threshold; the second return value is not needed here.
# The same τ is also passed as `max_cluster_height`.
clusters, _ = tomato(X, g, ds, τ; max_cluster_height = τ);
and plot it
# Back to a flat 2-D axis of fixed size.
axis = (; type = Axis, width = 800, height = 450)
# Cluster labels are converted to strings before plotting
# (presumably so they are coloured as categories rather than on a continuous scale).
df = (;
    x1 = X[1, :],
    x2 = X[2, :],
    cluster = string.(clusters),
)
plt = data(df) * mapping(:x1, :x2; color = :cluster)
draw(plt; axis = axis)
Toy example
The next example can be found at this link
# Load the toy data set and transpose it, so that each column of X is one point.
X = Matrix(readdlm("datasets/toy_example.txt")');
X
2×10000 Matrix{Float64}:
0.392345 5.40588 0.215228 2.55846 … 8.62843 2.99077 1.11412 5.19651
1.24778 2.40142 1.26093 7.2899 9.36162 7.13162 8.71054 7.12939
# Density estimate for the toy points.
# NOTE(review): `h` is presumably the kernel bandwidth — confirm in GeometricDatasets.
ds = gd.density_estimation(X; h = 0.5)
# Column table for AlgebraOfGraphics: both coordinates plus the density value.
df = (;
    x1 = X[1, :],
    x2 = X[2, :],
    ds = ds,
)
# Plot the points in the plane, coloured by density.
plt = data(df) * mapping(:x1, :x2; color = :ds)
draw(plt)
# Axis options passed to `draw`: a 3-D axis of fixed size.
axis = (; type = Axis3, width = 800, height = 450)
# Same table as for the 2-D plot: coordinates plus density.
df = (;
    x1 = X[1, :],
    x2 = X[2, :],
    ds = ds,
)
# The density is used both as the third coordinate and as the colour.
plt = data(df) * mapping(:x1, :x2, :ds; color = :ds)
draw(plt; axis = axis)
# Build the proximity graph on the columns of X.
# NOTE(review): the neighbourhood rule presumably uses an ε-ball of radius 0.2 and
# falls back to k-nearest neighbours when the ball is too sparse — confirm in ToMATo.
g = proximity_graph(
    X,
    epsilon_ball_or_knn(
        0.2;
        min_ball_points = 5,
        max_ball_points = 10,
        knn_points = 5,
    ),
)
{10000, 68200} undirected simple Int64 graph
# Run ToMATo with `Inf` in the position where the later call passes τ; only the
# second return value (births and deaths) is kept, the first is discarded.
_, births_and_deaths = tomato(X, g, ds, Inf)
# Plot the births and deaths; τ is chosen from this plot.
plot_births_and_deaths(births_and_deaths)
# Threshold used for the toy example.
τ = 0.005
# Cluster with that threshold; the second return value is not needed here.
# The same τ is also passed as `max_cluster_height`.
clusters, _ = tomato(X, g, ds, τ; max_cluster_height = τ);
# Back to a flat 2-D axis of fixed size.
axis = (; type = Axis, width = 800, height = 450)
# Cluster labels are converted to strings before plotting
# (presumably so they are coloured as categories rather than on a continuous scale).
df = (;
    x1 = X[1, :],
    x2 = X[2, :],
    cluster = string.(clusters),
)
plt = data(df) * mapping(:x1, :x2; color = :cluster)
draw(plt; axis = axis)