489 lines
14 KiB
Julia
489 lines
14 KiB
Julia
### A Pluto.jl notebook ###
|
||
# v0.12.20
|
||
|
||
using Markdown
|
||
using InteractiveUtils
|
||
|
||
# ╔═╡ e8322e0e-6bd2-11eb-3c59-032d9259f937
|
||
begin
|
||
using Dates
|
||
using CSV, Statistics
|
||
#using DataFrames
|
||
using Plots
|
||
#using AutoMLPipeline
|
||
using Flux
|
||
using CUDA
|
||
using MultivariateStats
|
||
using Distributed
|
||
using ScikitLearn
|
||
using JLBoost, JLBoostMLJ, MLJ
|
||
end
|
||
|
||
# ╔═╡ 87c0f780-6c4f-11eb-28d0-3dedf07b9c50
|
||
begin
|
||
@sync @everywhere using AutoMLPipeline
|
||
@sync @everywhere using DataFrames
|
||
end
|
||
|
||
# ╔═╡ ce121a34-6c59-11eb-019d-0bf399586199
|
||
begin
    using RDatasets;
    # Load the classic iris data set and add a Boolean target column flagging
    # the "setosa" species (a simple, known-separable sanity-check problem).
    iris = dataset("datasets", "iris");
    iris[!, :is_setosa] = iris.Species .== "setosa";

    # MLJ's `unpack`: Xi = every column except the label columns,
    # yi = the binary :is_setosa target.
    Xi, yi = unpack(iris, x->!(x in [:is_setosa, :Species]), ==(:is_setosa));

end
|
||
|
||
# ╔═╡ d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
|
||
begin
|
||
using Lathe.models: RandomForestClassifier
|
||
using StatsBase
|
||
end
|
||
|
||
# ╔═╡ 962ac8c4-6be0-11eb-3865-f5dce353b241
|
||
# Return the first space-delimited token of `label`
# (e.g. "Ile de France" -> "Ile"); used to shorten one-hot column names.
splitLabel(label) = first(split(label, " "))
|
||
|
||
# ╔═╡ 75736b1c-6bd5-11eb-08a7-630cd343e6bb
|
||
"""
    onehot_df(training, key, insert)

Drop column `key` from `training` and, when `insert` is true, append one
one-hot indicator column per unique value of that column.

New columns are named `<first 4 chars of key>_<first word of label>`,
e.g. key `"statut"` with label `"locataire"` -> `stat_locataire`.
Returns a new `DataFrame`; `training` itself is not modified.
"""
function onehot_df(training::DataFrame, key::String, insert::Bool)
    # When the caller only wants the column removed, skip the encoding.
    if !insert
        return select(training, Not(key))
    end
    # Sort labels so the generated column order is deterministic.
    labels = sort(unique(training[!, key]))
    oh_key = Flux.onehotbatch(training[!, key], labels)
    # `first(key, 4)` instead of `key[1:4]`: safe for keys shorter than four
    # characters and for multi-byte UTF-8 prefixes (byte indexing would throw).
    prefix = first(key, 4) * "_"
    return hcat(select(training, Not(key)),
                DataFrame(oh_key', prefix .* splitLabel.(labels)))
end
|
||
|
||
# ╔═╡ 90803e02-6bdd-11eb-0c9e-992ccc13cf4c
|
||
# Parse a timestamp string such as "2021-02-10 12:34:56" into a DateTime.
function dateConv(d)
    return Dates.DateTime(d, dateformat"yyyy-mm-dd HH:MM:SS")
end
|
||
|
||
# ╔═╡ 50e1ff74-6be2-11eb-0f36-458c037d64fc
|
||
# Convert a collection of time periods (e.g. DateTime differences) to whole-day
# counts as Ints, rounding each element to the nearest day.
function convertDay(array)
    return [Dates.value(convert(Dates.Day, round(p, Dates.Day))) for p in array]
end
|
||
|
||
# ╔═╡ 29fdbcde-6bda-11eb-2d0e-f737cdf5d019
|
||
begin
    # Load the raw insurance-contract training set from CSV.
    training = DataFrame(CSV.File("training_SC_GGP_AXA_FR.csv"))

    # --- Date conversion ---------------------------------------------------
    # Claim open/close dates are missing for contracts with no claim;
    # substitute "now" so the derived durations below come out as 0 days.
    today = Dates.now()
    today_string = Dates.format(today, "yyyy-mm-dd HH:MM:SS")
    replace!(training.ouverture_dernier_sinistre, missing => today_string)
    replace!(training.cloture_dernier_sinistre, missing => today_string)
    for key in ["debut_contrat", "ouverture_dernier_sinistre", "cloture_dernier_sinistre"]
        training[!, key] = dateConv.(training[!, key])
    end
    # Add duree: derive duration features (whole days) from the parsed dates,
    # then drop the raw date columns — the model only sees the durations.
    insertcols!(training, "duree_contrat" => convertDay(today - training.debut_contrat))
    insertcols!(training, "duree_dernier_sinistre" => convertDay(training.cloture_dernier_sinistre - training.ouverture_dernier_sinistre))
    insertcols!(training, "duree_zero_sinistre" => convertDay(today - training.cloture_dernier_sinistre))
    select!(training, Not("debut_contrat"))
    select!(training, Not("ouverture_dernier_sinistre"))
    select!(training, Not("cloture_dernier_sinistre"))

    # One hot encode categorical columns (each call drops the original column).
    training = onehot_df(training, "departement", true)
    training = onehot_df(training, "categorie_socio_professionnelle", true)
    training = onehot_df(training, "type_de_bien", true)
    training = onehot_df(training, "statut", true)

    # Drop the row index and cast every remaining column to Float32 so the
    # table can feed straight into Flux without further conversion.
    select!(training, Not("index"))
    training = Float32.(training)
end
|
||
|
||
# ╔═╡ 4d58899e-6bed-11eb-2a00-674d22246165
|
||
begin
|
||
using Flux.Data: DataLoader
|
||
using Flux: onehotbatch, onecold, logitcrossentropy, throttle, @epochs
|
||
using Base.Iterators: repeated
|
||
using Parameters: @with_kw
|
||
using MLDatasets
|
||
if has_cuda() # Check if CUDA is available
    @info "CUDA is on"
    # Forbid scalar indexing on GPU arrays — it silently serializes and is
    # extremely slow; better to fail loudly.
    CUDA.allowscalar(false)
end

# Hyperparameters for the Flux model defined below.
η = 1e-5 # learning rate
batchsize = 1000 # batch size
epochs = 1000 # number of epochs
device = cpu # set as gpu, if gpu available
|
||
|
||
# Build train/test DataLoaders from the notebook-global `training` table.
# Features become a (nfeatures, nsamples) Float32 matrix; labels become a
# 2×nsamples one-hot matrix for the binary target.
function getdata()
    ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

    # Loading Dataset: transpose so samples are columns (Flux convention).
    xtrain = permutedims(Array(Float32.(select(training, Not("target")))))
    ytrain = permutedims(Array(Float32.(select(training, "target"))))
    # NOTE(review): no held-out split — "test" data is the training data.
    xtest, ytest = xtrain, ytrain
    #xtrain, ytrain = MLDatasets.MNIST.traindata(Float32)
    #xtest, ytest = MLDatasets.MNIST.testdata(Float32)
    @show(size(xtrain), size(ytrain))
    @show(typeof(xtrain), typeof(ytrain))

    # Flatten the 1×n label matrix into a length-n vector for onehotbatch.
    #xtrain = Flux.flatten(xtrain)
    #xtest = Flux.flatten(xtest)
    ytrain = dropdims(ytrain; dims = 1)
    ytest = dropdims(ytest; dims = 1)
    @show(size(xtrain), size(ytrain))
    @show(typeof(xtrain), typeof(ytrain))

    # One-hot-encode the labels (binary classes 0 and 1).
    ytrain, ytest = onehotbatch(ytrain, 0:1), onehotbatch(ytest, 0:1)
    #ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)
    @show(size(xtrain), size(ytrain))
    @show(typeof(xtrain), typeof(ytrain))

    # Batching: shuffle only the training loader.
    train_data = DataLoader(xtrain, ytrain, batchsize=batchsize, shuffle=true)
    test_data = DataLoader(xtest, ytest, batchsize=batchsize)

    return train_data, test_data
end
|
||
|
||
# Build a simple MLP classifier: prod(imgsize) input features, two hidden
# ReLU layers of width `hidden`, and a linear output of `nclasses` logits
# (no softmax — pair with a logit-based loss).
function build_model(; imgsize=(20,1), nclasses=2, hidden=1024)
    in_dim = prod(imgsize)
    layers = (
        Dense(in_dim, hidden, relu),
        Dense(hidden, hidden, relu),
        Dense(hidden, nclasses),
    )
    return Chain(layers...)
end
|
||
|
||
# Mean logit-cross-entropy of `model` over all batches in `dataloader`
# (average of per-batch losses, not weighted by batch size).
function loss_all(dataloader, model)
    total = 0f0
    for (x, y) in dataloader
        total += logitcrossentropy(model(x), y)
    end
    return total / length(dataloader)
end
|
||
|
||
# Mean per-batch accuracy of `model` over `data_loader`: compares the argmax
# of the model output against the argmax of the one-hot labels. Moves both
# to CPU first so onecold works regardless of the device the model runs on.
function accuracy(data_loader, model)
    total = 0
    for (x, y) in data_loader
        preds = onecold(cpu(model(x)))
        truth = onecold(cpu(y))
        total += sum(preds .== truth) * 1 / size(x, 2)
    end
    return total / length(data_loader)
end
|
||
|
||
"""
    focal_loss(yh, y; gamma=100)

Focal loss (Lin et al., 2017): cross-entropy re-weighted by `(1 - pt)^gamma`
so that easy, already well-classified examples contribute less to the loss.

`yh` are predicted probabilities, `y` the one-hot targets. `gamma` controls
the down-weighting strength; the default 100 preserves the previously
hard-coded value. NOTE(review): the paper uses gamma ≈ 2 — 100 suppresses
nearly every example's gradient; confirm this is intended.
"""
function focal_loss(yh, y; gamma=100)
    # Per-element cross entropy (agg=identity: no reduction), so each example
    # gets its own focal weight.
    ce_loss = Flux.crossentropy(yh, y; agg=identity)
    # pt = model's probability assigned to the true class for each element.
    pt = exp.(-ce_loss)
    return mean(((1 .- pt) .^ gamma) .* ce_loss)
end
|
||
|
||
# Assemble data loaders and the MLP, move both to `device`, and set up the
# loss/optimizer. NOTE(review): the actual training loop below is commented
# out, so calling train() currently builds everything but fits nothing.
function train()
    # Initializing Model parameters

    # Load Data from the global `training` table.
    train_data,test_data = getdata()

    # Construct model and move model + loaders to the configured device
    # (`device = cpu` above).
    m = build_model()
    train_data = device.(train_data)
    test_data = device.(test_data)
    m = device(m)
    #loss(x,y) = focal_loss(m(x), y)
    loss(x,y) = logitcrossentropy(m(x), y)

    ## Training
    # Callback reporting loss/accuracy on the training set.
    evalcb = () -> @show(loss_all(train_data, m), accuracy(train_data, m))
    opt = ADAM(η)

    #Flux.@epochs epochs Flux.train!(loss, params(m), train_data, opt, cb = evalcb)

    #@show accuracy(train_data, m)

    #@show accuracy(test_data, m)
end
|
||
|
||
train()
|
||
end
|
||
|
||
# ╔═╡ 5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
|
||
begin
    using Lathe.preprocess: TrainTestSplit
    using Lathe.lstats: catacc
    # Split the preprocessed table into train/test subsets for the Lathe
    # random-forest experiment (uses Lathe's default split ratio).
    dtrain,dtest = TrainTestSplit(training)
end
|
||
|
||
# ╔═╡ c7ca7334-6bdb-11eb-2413-f50b8cd50f89
|
||
size(training)
|
||
|
||
# ╔═╡ b69d0732-6bd8-11eb-0144-39023b4d5326
|
||
describe(training)
|
||
|
||
# ╔═╡ 964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
|
||
#begin
|
||
# X = permutedims(Array(Float32.(select(training, Not("target")))))#[:, 1:100]# |> gpu
|
||
# Y = permutedims(Array(Float32.(select(training, "target"))))#[:, 1:100]# |> gpu
|
||
# Y = onehotbatch(dropdims(Y; dims = 1), 0:1)
|
||
#m = Chain(
|
||
# Dense(size(X)[1], 32, relu),
|
||
#Dense(32, 2),
|
||
#softmax)# |> gpu
|
||
|
||
# function loss(x, y)
|
||
# return Flux.logitcrossentropy(m(x), y)
|
||
# end
|
||
|
||
# function Accuracy(x, y)
|
||
# a = Flux.onecold(m(x))
|
||
# b = Flux.onecold(y)# |> gpu #### If this is not there, it beceomes a julia array
|
||
# @show(size(a), size(b))
|
||
# return mean(a .== b)
|
||
# end
|
||
|
||
# dataset = Iterators.repeated((X, Y), 10)
|
||
# evalcb = () -> @show(loss(X, Y), Accuracy(X, Y))
|
||
# opt = ADAM(0.001, (0.9, 0.999))
|
||
|
||
# Flux.@epochs 100 Flux.train!(loss, params(m), dataset, opt, cb = Flux.throttle(evalcb, 10))
|
||
|
||
# Accuracy(X, Y), loss(X, Y)
|
||
#end
|
||
|
||
# ╔═╡ 4a55b5fa-6c47-11eb-1b2d-95ae61217361
|
||
begin
    # Feature table (all columns except the label) and label vector used by
    # the AutoMLPipeline experiments below.
    X = Float32.(select(training, Not("target")))
    Y = Float32.(training[!, "target"]) |> Vector
end
|
||
|
||
# ╔═╡ e250e5da-6c49-11eb-1005-8567cf37eaf8
|
||
begin
    # Pipeline: keep numeric features -> robust scaling -> PCA, then plot the
    # first two principal components, with marker size encoding the class.
    ppca = SKPreprocessor("PCA");
    pnumf = NumFeatureSelector();
    prb = SKPreprocessor("RobustScaler");
    pohePCA = @pipeline pnumf |> prb |> ppca
    trPCA = fit_transform!(pohePCA,X,Y)
    scatter(trPCA[:, 1], trPCA[:, 2], markersize=3*Y.+1)
end
|
||
|
||
# ╔═╡ e9e72212-6c4b-11eb-0f57-9bd831b4dd80
|
||
begin
    # Random forest (100 trees) on robust-scaled numeric features,
    # cross-validated with AutoMLPipeline's default scoring.
    prf = SKLearner("RandomForestClassifier",Dict(:impl_args=>Dict(:n_estimators => 100)))
    rfp1 = @pipeline pnumf |> prb |> prf;
    crossvalidate(rfp1, X,Y)
end
|
||
|
||
# ╔═╡ 3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1
|
||
begin
    # from discourse discussion with zevelev
    #addprocs()
    #@everywhere using AutoMLPipeline, DataFrames

    # Get models: collect every scikit-learn learner name known to
    # AutoMLPipeline, case-insensitively sorted, then keep the classifiers
    # (plus naive-Bayes, SVC variants, and LDA/QDA).
    sk= AutoMLPipeline.SKLearners.learner_dict |> keys |> collect;
    sk= sk |> x-> sort(x,lt=(x,y)->lowercase(x)<lowercase(y));
    m_cl= sk[occursin.("Classifier", sk)];
    m_cl= m_cl ∪ sk[occursin.("NB", sk)];
    m_cl= m_cl ∪ sk[occursin.("SVC", sk)];
    m_cl= m_cl ∪ ["LDA", "QDA"];

    # find optimal learners: cross-validate every candidate in parallel and
    # collect one result row per learner; NaN scores are dropped.
    # NOTE(review): the loop-local `mean` shadows Statistics.mean inside the
    # loop body — harmless here, but easy to trip over when editing.
    learners = @distributed (vcat) for m in m_cl
        learner = SKLearner(m)
        pcmc = AutoMLPipeline.@pipeline learner
        println(learner.name)
        mean,sd,folds,err = crossvalidate(pcmc,X,Y,"accuracy_score",5)
        if !isnan(mean)
            DataFrame(name=learner.name,mean=mean,sd=sd,folds=folds,errors=err)
        else
            DataFrame()
        end
    end;
    sort!(learners,:mean,rev=true)
    @show learners;

    # optimized C
    #results=@distributed (vcat) for C in 1:5
    # @distributed (vcat) for gamma = 1:5
    # svcmodel = SKLearner("SVC",Dict(:impl_args=>Dict(:kernel=>"rbf",:C=>C,:gamma=>gamma) ))
    # mn,sd,fld,err = crossvalidate(svcmodel,X,Y)
    #DataFrame(name=svcmodel.name,mean=mn,sd=sd,C=C,gamma=gamma,folds=fld,errors=err)
    # end
    #end
    #sort!(results,:mean,rev=true)
    #@show results

    # search best learner by crossvalidation and use it for prediction
    #learners = SKLearner.(["AdaBoostClassifier","BaggingClassifier","SGDClassifier","SVC","LinearSVC"])
    #blearner = BestLearner(learners)
    #crossvalidate(blearner,X,Y,"accuracy_score")
    #fit!(blearner,X,Y)
end
|
||
|
||
# ╔═╡ 996a1126-6c4d-11eb-34fc-c97f0636695d
|
||
learners[1, :]
|
||
|
||
# ╔═╡ 6eb62f38-6c4f-11eb-0d56-975521d7d224
|
||
begin
    # Add workers for the distributed pipeline search below.
    nprocs() == 1 && addprocs();
    workers()

    #### feature selectors
    catf = CatFeatureSelector();
    numf = NumFeatureSelector();
    # hot-bit encoder
    ohe = AutoMLPipeline.OneHotEncoder();
    #### feature scalers
    # NOTE(review): `std` and `norm` shadow Statistics.std / LinearAlgebra.norm
    # in this scope.
    rb = SKPreprocessor("RobustScaler");
    pt = SKPreprocessor("PowerTransformer");
    mx = SKPreprocessor("MinMaxScaler");
    std = SKPreprocessor("StandardScaler");
    norm = SKPreprocessor("Normalizer");
    #### feature extractors
    #pca = SKPreprocessor("PCA", Dict(:autocomponent => true));
    #ica = SKPreprocessor("FastICA", Dict(:autocomponent => true));
    #fa = SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true));
    #### Learners
    rf = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 10)));
    gb = SKLearner("GradientBoostingClassifier");
    lsvc = SKLearner("LinearSVC");
    mlp = SKLearner("MLPClassifier");
    stack = StackEnsemble();
    rbfsvc = SKLearner("SVC");
    ada = SKLearner("AdaBoostClassifier");
    vote = VoteEnsemble();
    best = BestLearner();
    tree = PrunedTree();
    sgd = SKLearner("SGDClassifier");
    noop = Identity(Dict(:name => "Noop"));


    # Parallel Search for Datamining Optimal Pipelines: cross-validate every
    # scaler × learner combination and return a table sorted by mean accuracy.
    function prpsearch()
        learners = [rf,ada,sgd,tree,rbfsvc,lsvc,gb];
        scalers = [rb,pt,norm,std,mx,noop];
        dftable = @sync @distributed (vcat) for lr in learners
            @distributed (vcat) for sc in scalers
                # categorical -> one-hot, numeric -> scaler, then the learner.
                pipe = AutoMLPipeline.@pipeline (catf |> ohe) + (numf |> sc ) |> lr
                # Strip the 4-char uniqueness suffix AutoMLPipeline appends
                # to component names when building the display name.
                scn = sc.name[1:end - 4]; lrn = lr.name[1:end - 4]
                pname = "$scn |> $lrn"
                ptime = @elapsed begin
                    mean, sd, kfold, _ = crossvalidate(pipe, X, Y, "accuracy_score", 5)
                end
                DataFrame(pipeline=pname, mean=mean, sd=sd, time=ptime, folds=kfold)
            end
        end
        sort!(dftable, :mean, rev=true);
        dftable
    end
    # NOTE(review): the search call is commented out, so `df` is never
    # assigned — the `df` display cell below will fail until re-enabled.
    runtime = @elapsed begin
        #df = prpsearch()
    end;
    #serialtime = df.time |> sum;
    #(serialtime = "$(round(serialtime / 60.0)) minutes", paralleltime = "$(round(runtime)) seconds")

    # pipeline performances
    #@show df
end
|
||
|
||
# ╔═╡ 22b11720-6c50-11eb-3e44-c726d18c6948
|
||
df
|
||
|
||
# ╔═╡ e3d0797a-6c53-11eb-1923-c719d8210533
|
||
gb
|
||
|
||
# ╔═╡ cd229ebc-6c56-11eb-1f44-b3ce2e8e0875
|
||
begin
    # Tune a JLBoost classifier over nrounds, max_depth and eta with MLJ's
    # TunedModel; `machine` binds the tuning strategy to the data (fit in a
    # later cell).
    jlb = JLBoostClassifier()
    r1 = range(jlb, :nrounds, lower=1, upper = 6)
    r2 = range(jlb, :max_depth, lower=1, upper = 6)
    r3 = range(jlb, :eta, lower=0.1, upper=1.0)
    tm = TunedModel(model = jlb, ranges = [r1, r2, r3], measure = cross_entropy)
    XX = X
    # MLJ classifiers expect a categorical target, not raw Float32 0/1.
    YY = categorical(Bool.(Y))
    m = machine(tm, XX, YY)
end
|
||
|
||
# ╔═╡ 13522512-6c5a-11eb-2278-2d6bda47e9a1
|
||
parent(YY)
|
||
|
||
# ╔═╡ 331af5c8-6c59-11eb-2dbd-c3a9b8127f97
|
||
MLJ.fit!(m)
|
||
|
||
# ╔═╡ 232e07a2-6c5b-11eb-15e2-2b94b7f934dc
|
||
X
|
||
|
||
# ╔═╡ 24c4b282-6c5b-11eb-3a32-3936b6fde11e
|
||
Y
|
||
|
||
# ╔═╡ 5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
|
||
training
|
||
|
||
# ╔═╡ a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
|
||
dtest
|
||
|
||
# ╔═╡ 834f2260-6c5b-11eb-3577-3d0ffc913da6
|
||
begin
    # Feature/label split of the Lathe train/test subsets for the
    # random-forest experiment below.
    trainX = (select(dtrain, Not("target")))
    trainy = dtrain[!, "target"]
    testX = (select(dtest, Not("target")))
    testy = dtest[!, "target"]
end
|
||
|
||
# ╔═╡ dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
|
||
begin
    # Fit a Lathe random forest and score categorical accuracy on the
    # held-out split. `Set(yhat)` shows which classes were actually predicted
    # (useful to spot a degenerate single-class model).
    model = RandomForestClassifier(trainX, trainy, n_trees = 10, max_depth = 11)
    yhat = model.predict(testX)
    Set(yhat), catacc(yhat, testy)
end
|
||
|
||
# ╔═╡ eb259a9e-6c5c-11eb-231a-c37636e42302
|
||
begin
    # NOTE(review): EvoTreeRegressor is a regression model, but the target
    # here is binary — EvoTreeClassifier may be intended; confirm.
    Booster = @load EvoTreeRegressor
    booster = Booster(max_depth=2) # specify hyperparameter at construction
    booster.nrounds=50 # or mutate post facto
end
|
||
|
||
# ╔═╡ f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc
|
||
|
||
|
||
# ╔═╡ f70c90ee-6c5c-11eb-355e-454a4b2f89d9
|
||
|
||
|
||
# ╔═╡ ffce310e-6c5c-11eb-2025-5f17f3cbed51
|
||
|
||
|
||
# ╔═╡ 091bbe02-6c5d-11eb-2979-75c5cab5b613
|
||
|
||
|
||
# ╔═╡ Cell order:
|
||
# ╠═e8322e0e-6bd2-11eb-3c59-032d9259f937
|
||
# ╠═87c0f780-6c4f-11eb-28d0-3dedf07b9c50
|
||
# ╠═962ac8c4-6be0-11eb-3865-f5dce353b241
|
||
# ╠═75736b1c-6bd5-11eb-08a7-630cd343e6bb
|
||
# ╠═90803e02-6bdd-11eb-0c9e-992ccc13cf4c
|
||
# ╠═50e1ff74-6be2-11eb-0f36-458c037d64fc
|
||
# ╠═29fdbcde-6bda-11eb-2d0e-f737cdf5d019
|
||
# ╠═c7ca7334-6bdb-11eb-2413-f50b8cd50f89
|
||
# ╠═b69d0732-6bd8-11eb-0144-39023b4d5326
|
||
# ╠═964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
|
||
# ╠═4d58899e-6bed-11eb-2a00-674d22246165
|
||
# ╠═4a55b5fa-6c47-11eb-1b2d-95ae61217361
|
||
# ╠═e250e5da-6c49-11eb-1005-8567cf37eaf8
|
||
# ╠═e9e72212-6c4b-11eb-0f57-9bd831b4dd80
|
||
# ╠═3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1
|
||
# ╠═996a1126-6c4d-11eb-34fc-c97f0636695d
|
||
# ╠═6eb62f38-6c4f-11eb-0d56-975521d7d224
|
||
# ╠═22b11720-6c50-11eb-3e44-c726d18c6948
|
||
# ╠═e3d0797a-6c53-11eb-1923-c719d8210533
|
||
# ╠═cd229ebc-6c56-11eb-1f44-b3ce2e8e0875
|
||
# ╠═13522512-6c5a-11eb-2278-2d6bda47e9a1
|
||
# ╠═ce121a34-6c59-11eb-019d-0bf399586199
|
||
# ╠═331af5c8-6c59-11eb-2dbd-c3a9b8127f97
|
||
# ╠═d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
|
||
# ╠═232e07a2-6c5b-11eb-15e2-2b94b7f934dc
|
||
# ╠═24c4b282-6c5b-11eb-3a32-3936b6fde11e
|
||
# ╠═5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
|
||
# ╠═5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
|
||
# ╠═a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
|
||
# ╠═834f2260-6c5b-11eb-3577-3d0ffc913da6
|
||
# ╠═dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
|
||
# ╠═eb259a9e-6c5c-11eb-231a-c37636e42302
|
||
# ╠═f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc
|
||
# ╠═f70c90ee-6c5c-11eb-355e-454a4b2f89d9
|
||
# ╠═ffce310e-6c5c-11eb-2025-5f17f3cbed51
|
||
# ╠═091bbe02-6c5d-11eb-2979-75c5cab5b613
|