### A Pluto.jl notebook ###
# v0.12.20
using Markdown
using InteractiveUtils
# ╔═╡ e8322e0e-6bd2-11eb-3c59-032d9259f937
begin
using Dates
using CSV, Statistics
#using DataFrames
using Plots
#using AutoMLPipeline
using Flux
using CUDA
using MultivariateStats
using Distributed
using ScikitLearn
using JLBoost, JLBoostMLJ, MLJ
end
# ╔═╡ 87c0f780-6c4f-11eb-28d0-3dedf07b9c50
begin
@sync @everywhere using AutoMLPipeline
@sync @everywhere using DataFrames
end
# ╔═╡ ce121a34-6c59-11eb-019d-0bf399586199
begin
using RDatasets;
iris = dataset("datasets", "iris");
iris[!, :is_setosa] = iris.Species .== "setosa";
Xi, yi = unpack(iris, x->!(x in [:is_setosa, :Species]), ==(:is_setosa));
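# Xi: all columns except the targets; yi: the Bool is_setosa labels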
end
# ╔═╡ d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
begin
using Lathe.models: RandomForestClassifier
using StatsBase
end
# ╔═╡ 962ac8c4-6be0-11eb-3865-f5dce353b241
splitLabel(label) = split(label, " ")[1]
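# keeps only the first whitespace-delimited token, e.g. splitLabel("Hauts de Seine") == "Hauts"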
# ╔═╡ 75736b1c-6bd5-11eb-08a7-630cd343e6bb
function onehot_df(training::DataFrame, key::String, insert::Bool)
if !insert
return select(training, Not(key))
end
labels = sort(unique(training[!, key]))
oh_key = Flux.onehotbatch(training[!, key], labels)
return hcat(select(training, Not(key)), DataFrame(oh_key', (key[1:4]*"_") .* splitLabel.(labels)))
end
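# Usage sketch: onehot_df(training, "statut", true) replaces :statut with one 0/1 column
# per level, named "<first 4 chars of key>_<first token of level>"; insert = false drops the column.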
# ╔═╡ 90803e02-6bdd-11eb-0c9e-992ccc13cf4c
dateConv(d) = Dates.DateTime(d, "yyyy-mm-dd HH:MM:SS")
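# e.g. dateConv("2021-02-11 09:30:00") == DateTime(2021, 2, 11, 9, 30)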
# ╔═╡ 50e1ff74-6be2-11eb-0f36-458c037d64fc
convertDay(array) = Dates.value.(convert.(Dates.Day, round.(array, Dates.Day)))
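# rounds a vector of time periods to whole days and returns the counts as Ints,
# e.g. convertDay([Dates.Hour(49)]) == [2]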
# ╔═╡ 29fdbcde-6bda-11eb-2d0e-f737cdf5d019
begin
training = DataFrame(CSV.File("training_SC_GGP_AXA_FR.csv"))
# Date conversion
# Replace missing claim dates with today's timestamp
today = Dates.now()
today_string = Dates.format(today, "yyyy-mm-dd HH:MM:SS")
replace!(training.ouverture_dernier_sinistre, missing => today_string)
replace!(training.cloture_dernier_sinistre, missing => today_string)
for key in ["debut_contrat", "ouverture_dernier_sinistre", "cloture_dernier_sinistre"]
training[!, key] = dateConv.(training[!, key])
end
# Add duration columns (duree_*) derived from the dates
insertcols!(training, "duree_contrat" => convertDay(today - training.debut_contrat))
insertcols!(training, "duree_dernier_sinistre" => convertDay(training.cloture_dernier_sinistre - training.ouverture_dernier_sinistre))
insertcols!(training, "duree_zero_sinistre" => convertDay(today - training.cloture_dernier_sinistre))
select!(training, Not("debut_contrat"))
select!(training, Not("ouverture_dernier_sinistre"))
select!(training, Not("cloture_dernier_sinistre"))
# One hot encode categorical
training = onehot_df(training, "departement", true)
training = onehot_df(training, "categorie_socio_professionnelle", true)
training = onehot_df(training, "type_de_bien", true)
training = onehot_df(training, "statut", true)
select!(training, Not("index"))
training = Float32.(training)
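# training is now an all-Float32 table: the three duration columns,
# the one-hot encoded categoricals, and the target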
end
# ╔═╡ 4d58899e-6bed-11eb-2a00-674d22246165
begin
using Flux.Data: DataLoader
using Flux: onehotbatch, onecold, logitcrossentropy, throttle, @epochs
using Base.Iterators: repeated
using Parameters: @with_kw
using MLDatasets
if has_cuda() # Check if CUDA is available
@info "CUDA is on"
CUDA.allowscalar(false)
end
η = 1e-5 # learning rate
batchsize = 1000 # batch size
epochs = 1000 # number of epochs
device = cpu # set to gpu if a GPU is available
function getdata()
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"
# Loading Dataset
xtrain = permutedims(Array(Float32.(select(training, Not("target")))))
ytrain = permutedims(Array(Float32.(select(training, "target"))))
xtest, ytest = xtrain, ytrain
#xtrain, ytrain = MLDatasets.MNIST.traindata(Float32)
#xtest, ytest = MLDatasets.MNIST.testdata(Float32)
@show(size(xtrain), size(ytrain))
@show(typeof(xtrain), typeof(ytrain))
# (MNIST-template leftover) reshape to flatten each image into a linear array
#xtrain = Flux.flatten(xtrain)
#xtest = Flux.flatten(xtest)
ytrain = dropdims(ytrain; dims = 1)
ytest = dropdims(ytest; dims = 1)
@show(size(xtrain), size(ytrain))
@show(typeof(xtrain), typeof(ytrain))
# One-hot-encode the labels
ytrain, ytest = onehotbatch(ytrain, 0:1), onehotbatch(ytest, 0:1)
#ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)
@show(size(xtrain), size(ytrain))
@show(typeof(xtrain), typeof(ytrain))
# Batching
train_data = DataLoader(xtrain, ytrain, batchsize=batchsize, shuffle=true)
test_data = DataLoader(xtest, ytest, batchsize=batchsize)
return train_data, test_data
end
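# "imgsize" is a leftover name from the MNIST example this cell is adapted from;
# prod(imgsize) = 20 is simply the number of tabular features per sample here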
function build_model(; imgsize=(20,1), nclasses=2, hidden=1024)
return Chain(
Dense(prod(imgsize), hidden, relu),
Dense(hidden, hidden, relu),
Dense(hidden, nclasses))
end
function loss_all(dataloader, model)
l = 0f0
for (x,y) in dataloader
l += logitcrossentropy(model(x), y)
end
l/length(dataloader)
end
function accuracy(data_loader, model)
acc = 0
for (x,y) in data_loader
acc += sum(onecold(cpu(model(x))) .== onecold(cpu(y))) / size(x,2)
end
acc/length(data_loader)
end
function focal_loss(yh, y)
ce_loss = Flux.crossentropy(yh, y; agg=identity)
pt = exp.(-ce_loss)
gamma = 100
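# NOTE: the original focal-loss paper (Lin et al., 2017) uses γ ≈ 2;
# γ = 100 is an extreme setting that all but zeroes the loss on easy examples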
return mean(((1 .- pt) .^ gamma) .* ce_loss)
end
function train()
# Initializing Model parameters
# Load Data
train_data,test_data = getdata()
# Construct model
m = build_model()
train_data = device.(train_data)
test_data = device.(test_data)
m = device(m)
#loss(x,y) = focal_loss(m(x), y)
loss(x,y) = logitcrossentropy(m(x), y)
## Training
evalcb = () -> @show(loss_all(train_data, m), accuracy(train_data, m))
opt = ADAM(η)
#Flux.@epochs epochs Flux.train!(loss, params(m), train_data, opt, cb = evalcb)
#@show accuracy(train_data, m)
#@show accuracy(test_data, m)
end
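# NOTE: the Flux.train! call inside train() is commented out, so the call below
# only builds the data loaders and the model; no weights are updated.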
train()
end
# ╔═╡ 5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
begin
using Lathe.preprocess: TrainTestSplit
using Lathe.lstats: catacc
dtrain,dtest = TrainTestSplit(training)
end
# ╔═╡ c7ca7334-6bdb-11eb-2413-f50b8cd50f89
size(training)
# ╔═╡ b69d0732-6bd8-11eb-0144-39023b4d5326
describe(training)
# ╔═╡ 964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
#begin
# X = permutedims(Array(Float32.(select(training, Not("target")))))#[:, 1:100]# |> gpu
# Y = permutedims(Array(Float32.(select(training, "target"))))#[:, 1:100]# |> gpu
# Y = onehotbatch(dropdims(Y; dims = 1), 0:1)
#m = Chain(
# Dense(size(X)[1], 32, relu),
#Dense(32, 2),
#softmax)# |> gpu
# function loss(x, y)
# return Flux.logitcrossentropy(m(x), y)
# end
# function Accuracy(x, y)
# a = Flux.onecold(m(x))
# b = Flux.onecold(y)# |> gpu #### If this is not there, it becomes a plain Julia array
# @show(size(a), size(b))
# return mean(a .== b)
# end
# dataset = Iterators.repeated((X, Y), 10)
# evalcb = () -> @show(loss(X, Y), Accuracy(X, Y))
# opt = ADAM(0.001, (0.9, 0.999))
# Flux.@epochs 100 Flux.train!(loss, params(m), dataset, opt, cb = Flux.throttle(evalcb, 10))
# Accuracy(X, Y), loss(X, Y)
#end
# ╔═╡ 4a55b5fa-6c47-11eb-1b2d-95ae61217361
begin
X = Float32.(select(training, Not("target")))
Y = Float32.(training[!, "target"]) |> Vector
end
# ╔═╡ e250e5da-6c49-11eb-1005-8567cf37eaf8
begin
ppca = SKPreprocessor("PCA");
pnumf = NumFeatureSelector();
prb = SKPreprocessor("RobustScaler");
pohePCA = @pipeline pnumf |> prb |> ppca
trPCA = fit_transform!(pohePCA,X,Y)
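# scatter of the first two principal components; marker size (3Y + 1) encodes the class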
scatter(trPCA[:, 1], trPCA[:, 2], markersize=3*Y.+1)
end
# ╔═╡ e9e72212-6c4b-11eb-0f57-9bd831b4dd80
begin
prf = SKLearner("RandomForestClassifier",Dict(:impl_args=>Dict(:n_estimators => 100)))
rfp1 = @pipeline pnumf |> prb |> prf;
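# crossvalidate returns per-fold statistics as (mean, sd, folds, errors)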
crossvalidate(rfp1, X,Y)
end
# ╔═╡ 3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1
begin
# from discourse discussion with zevelev
#addprocs()
#@everywhere using AutoMLPipeline, DataFrames
# Get models.
sk = AutoMLPipeline.SKLearners.learner_dict |> keys |> collect;
sk = sk |> x -> sort(x, lt=(x, y) -> lowercase(x) < lowercase(y));
m_cl = sk[occursin.("Classifier", sk)];
m_cl = m_cl ∪ sk[occursin.("NB", sk)];
m_cl = m_cl ∪ sk[occursin.("SVC", sk)];
m_cl = m_cl ∪ ["LDA", "QDA"];
# find optimal learners
learners = @distributed (vcat) for m in m_cl
learner = SKLearner(m)
pcmc = AutoMLPipeline.@pipeline learner
println(learner.name)
mean,sd,folds,err = crossvalidate(pcmc,X,Y,"accuracy_score",5)
if !isnan(mean)
DataFrame(name=learner.name,mean=mean,sd=sd,folds=folds,errors=err)
else
DataFrame()
end
end;
sort!(learners,:mean,rev=true)
@show learners;
# optimized C
#results=@distributed (vcat) for C in 1:5
# @distributed (vcat) for gamma = 1:5
# svcmodel = SKLearner("SVC",Dict(:impl_args=>Dict(:kernel=>"rbf",:C=>C,:gamma=>gamma) ))
# mn,sd,fld,err = crossvalidate(svcmodel,X,Y)
#DataFrame(name=svcmodel.name,mean=mn,sd=sd,C=C,gamma=gamma,folds=fld,errors=err)
# end
#end
#sort!(results,:mean,rev=true)
#@show results
# search best learner by crossvalidation and use it for prediction
#learners = SKLearner.(["AdaBoostClassifier","BaggingClassifier","SGDClassifier","SVC","LinearSVC"])
#blearner = BestLearner(learners)
#crossvalidate(blearner,X,Y,"accuracy_score")
#fit!(blearner,X,Y)
end
# ╔═╡ 996a1126-6c4d-11eb-34fc-c97f0636695d
learners[1, :]
# ╔═╡ 6eb62f38-6c4f-11eb-0d56-975521d7d224
begin
# Add workers
nprocs() == 1 && addprocs();
workers()
#### feature selectors
catf = CatFeatureSelector();
numf = NumFeatureSelector();
# hot-bit encoder
ohe = AutoMLPipeline.OneHotEncoder();
#### feature scalers
rb = SKPreprocessor("RobustScaler");
pt = SKPreprocessor("PowerTransformer");
mx = SKPreprocessor("MinMaxScaler");
std = SKPreprocessor("StandardScaler");
norm = SKPreprocessor("Normalizer");
#### feature extractors
#pca = SKPreprocessor("PCA", Dict(:autocomponent => true));
#ica = SKPreprocessor("FastICA", Dict(:autocomponent => true));
#fa = SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true));
#### Learners
rf = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 10)));
gb = SKLearner("GradientBoostingClassifier");
lsvc = SKLearner("LinearSVC");
mlp = SKLearner("MLPClassifier");
stack = StackEnsemble();
rbfsvc = SKLearner("SVC");
ada = SKLearner("AdaBoostClassifier");
vote = VoteEnsemble();
best = BestLearner();
tree = PrunedTree();
sgd = SKLearner("SGDClassifier");
noop = Identity(Dict(:name => "Noop"));
# Parallel search for optimal data-mining pipelines
function prpsearch()
learners = [rf,ada,sgd,tree,rbfsvc,lsvc,gb];
scalers = [rb,pt,norm,std,mx,noop];
dftable = @sync @distributed (vcat) for lr in learners
@distributed (vcat) for sc in scalers
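# one-hot encode the categorical features, scale the numeric ones, then feed both to the learner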
pipe = AutoMLPipeline.@pipeline (catf |> ohe) + (numf |> sc ) |> lr
scn = sc.name[1:end - 4]; lrn = lr.name[1:end - 4]
pname = "$scn |> $lrn"
ptime = @elapsed begin
mean, sd, kfold, _ = crossvalidate(pipe, X, Y, "accuracy_score", 5)
end
DataFrame(pipeline=pname, mean=mean, sd=sd, time=ptime, folds=kfold)
end
end
sort!(dftable, :mean, rev=true);
dftable
end
runtime = @elapsed begin
#df = prpsearch()
end;
#serialtime = df.time |> sum;
#(serialtime = "$(round(serialtime / 60.0)) minutes", paralleltime = "$(round(runtime)) seconds")
# pipeline performances
#@show df
end
# ╔═╡ 22b11720-6c50-11eb-3e44-c726d18c6948
df
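# NOTE: df is only bound if the `df = prpsearch()` line in the previous cell is uncommented.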
# ╔═╡ e3d0797a-6c53-11eb-1923-c719d8210533
gb
# ╔═╡ cd229ebc-6c56-11eb-1f44-b3ce2e8e0875
begin
jlb = JLBoostClassifier()
r1 = range(jlb, :nrounds, lower=1, upper = 6)
r2 = range(jlb, :max_depth, lower=1, upper = 6)
r3 = range(jlb, :eta, lower=0.1, upper=1.0)
tm = TunedModel(model = jlb, ranges = [r1, r2, r3], measure = cross_entropy)
XX = X
YY = categorical(Bool.(Y))
m = machine(tm, XX, YY)
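# fitting this machine (MLJ.fit! in a later cell) runs the tuning search over
# r1–r3 and keeps the model with the best cross-entropy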
end
# ╔═╡ 13522512-6c5a-11eb-2278-2d6bda47e9a1
parent(YY)
# ╔═╡ 331af5c8-6c59-11eb-2dbd-c3a9b8127f97
MLJ.fit!(m)
# ╔═╡ 232e07a2-6c5b-11eb-15e2-2b94b7f934dc
X
# ╔═╡ 24c4b282-6c5b-11eb-3a32-3936b6fde11e
Y
# ╔═╡ 5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
training
# ╔═╡ a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
dtest
# ╔═╡ 834f2260-6c5b-11eb-3577-3d0ffc913da6
begin
trainX = (select(dtrain, Not("target")))
trainy = dtrain[!, "target"]
testX = (select(dtest, Not("target")))
testy = dtest[!, "target"]
end
# ╔═╡ dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
begin
model = RandomForestClassifier(trainX, trainy, n_trees = 10, max_depth = 11)
yhat = model.predict(testX)
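# Set(yhat) shows which classes were actually predicted; catacc (from Lathe.lstats)
# scores categorical accuracy against testy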
Set(yhat), catacc(yhat, testy)
end
# ╔═╡ eb259a9e-6c5c-11eb-231a-c37636e42302
begin
Booster = @load EvoTreeRegressor
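# assumes EvoTrees.jl is installed in the active environment; @load returns the model type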
booster = Booster(max_depth=2) # specify hyperparameter at construction
booster.nrounds=50 # or mutate post facto
end
# ╔═╡ f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc
# ╔═╡ f70c90ee-6c5c-11eb-355e-454a4b2f89d9
# ╔═╡ ffce310e-6c5c-11eb-2025-5f17f3cbed51
# ╔═╡ 091bbe02-6c5d-11eb-2979-75c5cab5b613
# ╔═╡ Cell order:
# ╠═e8322e0e-6bd2-11eb-3c59-032d9259f937
# ╠═87c0f780-6c4f-11eb-28d0-3dedf07b9c50
# ╠═962ac8c4-6be0-11eb-3865-f5dce353b241
# ╠═75736b1c-6bd5-11eb-08a7-630cd343e6bb
# ╠═90803e02-6bdd-11eb-0c9e-992ccc13cf4c
# ╠═50e1ff74-6be2-11eb-0f36-458c037d64fc
# ╠═29fdbcde-6bda-11eb-2d0e-f737cdf5d019
# ╠═c7ca7334-6bdb-11eb-2413-f50b8cd50f89
# ╠═b69d0732-6bd8-11eb-0144-39023b4d5326
# ╠═964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
# ╠═4d58899e-6bed-11eb-2a00-674d22246165
# ╠═4a55b5fa-6c47-11eb-1b2d-95ae61217361
# ╠═e250e5da-6c49-11eb-1005-8567cf37eaf8
# ╠═e9e72212-6c4b-11eb-0f57-9bd831b4dd80
# ╠═3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1
# ╠═996a1126-6c4d-11eb-34fc-c97f0636695d
# ╠═6eb62f38-6c4f-11eb-0d56-975521d7d224
# ╠═22b11720-6c50-11eb-3e44-c726d18c6948
# ╠═e3d0797a-6c53-11eb-1923-c719d8210533
# ╠═cd229ebc-6c56-11eb-1f44-b3ce2e8e0875
# ╠═13522512-6c5a-11eb-2278-2d6bda47e9a1
# ╠═ce121a34-6c59-11eb-019d-0bf399586199
# ╠═331af5c8-6c59-11eb-2dbd-c3a9b8127f97
# ╠═d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
# ╠═232e07a2-6c5b-11eb-15e2-2b94b7f934dc
# ╠═24c4b282-6c5b-11eb-3a32-3936b6fde11e
# ╠═5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
# ╠═5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
# ╠═a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
# ╠═834f2260-6c5b-11eb-3577-3d0ffc913da6
# ╠═dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
# ╠═eb259a9e-6c5c-11eb-231a-c37636e42302
# ╠═f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc
# ╠═f70c90ee-6c5c-11eb-355e-454a4b2f89d9
# ╠═ffce310e-6c5c-11eb-2025-5f17f3cbed51
# ╠═091bbe02-6c5d-11eb-2979-75c5cab5b613