redim = 2;
# Loading data
iris_data = readdlm("iris_data.csv");
iris_target = readdlm("iris_target.csv");
# Center data
iris_data = broadcast(-, iris_data, mean(iris_data, 1));
n_data, n_dim = size(iris_data);
Sw = zeros(n_dim, n_dim);
Sb = zeros(n_dim, n_dim);
C = cov(iris_data);
classes = unique(iris_target);
for i=1:length(classes)
index = find(x -> x==classes[i], iris_target);
d = iris_data[index,:];
classcov = cov(d);
Sw += length(index) / n_data .* classcov;
end
Sb = C - Sw;
evals, evecs = eig(Sw, Sb);
w = evecs[:,1:redim];
new_data = iris_data * w;
This code just does LDA (linear discriminant analysis) for the iris_data. Reduct the dimensions of the iris_data to 2. It will takes about 4 seconds, but Python(numpy/scipy) only takes about 0.6 seconds. Why?
This is from the first page, second paragraph of the introduction in the Julia Manual:
Because Julia’s compiler is different from the interpreters used for languages like Python or R, you may find that Julia’s performance is unintuitive at first. If you find that something is slow, we highly recommend reading through the Performance Tips section before trying anything else. Once you understand how Julia works, it’s easy to write code that’s nearly as fast as C.
Excerpt:
Avoid global variables
A global variable might have its value, and therefore its type, change at any point. This makes it difficult for the compiler to optimize code using global variables. Variables should be local, or passed as arguments to functions, whenever possible.
Any code that is performance critical or being benchmarked should be inside a function.
We find that global names are frequently constants, and declaring them as such greatly improves performance
Knowing that the script (all procedural top level code) style is so pervasive among many scientific computing users, I would recommend you to at least wrap the whole file inside a let
expression for starters (let introduces a new local scope), ie:
let
redim = 2
# Loading data
iris_data = readdlm("iris_data.csv")
iris_target = readdlm("iris_target.csv")
# Center data
iris_data = broadcast(-, iris_data, mean(iris_data, 1))
n_data, n_dim = size(iris_data)
Sw = zeros(n_dim, n_dim)
Sb = zeros(n_dim, n_dim)
C = cov(iris_data)
classes = unique(iris_target)
for i=1:length(classes)
index = find(x -> x==classes[i], iris_target)
d = iris_data[index,:]
classcov = cov(d)
Sw += length(index) / n_data .* classcov
end
Sb = C - Sw
evals, evecs = eig(Sw, Sb)
w = evecs[:,1:redim]
new_data = iris_data * w
end
But I would also urge you to refactor that into small functions and then compose a main
function that calls the rest, something like this, notice how this refactor makes your code general and reusable (and fast):
module LinearDiscriminantAnalysis
export load_data, center_data
"Returns data and target Matrices."
load_data(data_path, target_path) = (readdlm(data_path), readdlm(target_path))
function center_data(data, target)
data = broadcast(-, data, mean(data, 1))
n_data, n_dim = size(data)
Sw = zeros(n_dim, n_dim)
Sb = zeros(n_dim, n_dim)
C = cov(data)
classes = unique(target)
for i=1:length(classes)
index = find(x -> x==classes[i], target)
d = data[index,:]
classcov = cov(d)
Sw += length(index) / n_data .* classcov
end
Sb = C - Sw
evals, evecs = eig(Sw, Sb)
redim = 2
w = evecs[:,1:redim]
return data * w
end
end
using LinearDiscriminantAnalysis
function main()
iris_data, iris_target = load_data("iris_data.csv", "iris_target.csv")
result = center_data(iris_data, iris_target)
@show result
end
main()
Notes:
main
is just a name, it could be anything else you like.If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With