### A Pluto.jl notebook ###
# v0.12.20
using Markdown
using InteractiveUtils
# ╔═╡ e8322e0e-6bd2-11eb-3c59-032d9259f937
begin
    using Dates
    using CSV, Statistics
    #using DataFrames        # loaded via @everywhere below
    using Plots
    #using AutoMLPipeline    # loaded via @everywhere below
    using Flux
    using CUDA
    using MultivariateStats
    using Distributed
    using ScikitLearn
    using JLBoost, JLBoostMLJ, MLJ
end
# ╔═╡ 87c0f780-6c4f-11eb-28d0-3dedf07b9c50
begin
    @sync @everywhere using AutoMLPipeline
    @sync @everywhere using DataFrames
end
# ╔═╡ ce121a34-6c59-11eb-019d-0bf399586199
begin
    # Iris toy dataset: predict whether a flower is setosa
    using RDatasets;
    iris = dataset("datasets", "iris");
    iris[!, :is_setosa] = iris.Species .== "setosa";
    Xi, yi = unpack(iris, x -> !(x in [:is_setosa, :Species]), ==(:is_setosa));
end
# ╔═╡ d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
begin
    using Lathe.models: RandomForestClassifier
    using StatsBase
end
# ╔═╡ 962ac8c4-6be0-11eb-3865-f5dce353b241
# Keep only the first space-delimited token of a label
splitLabel(label) = split(label, " ")[1]
# ╔═╡ 75736b1c-6bd5-11eb-08a7-630cd343e6bb
function onehot_df(training::DataFrame, key::String, insert::Bool)
    if !insert
        return select(training, Not(key))
    end
    labels = sort(unique(training[!, key]))
    oh_key = Flux.onehotbatch(training[!, key], labels)
    # Replace the categorical column by one 0/1 column per level,
    # prefixed with the first four characters of the key
    return hcat(select(training, Not(key)),
                DataFrame(oh_key', (key[1:4] * "_") .* splitLabel.(labels)))
end
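# ╔═╡ 7a1d2f40-6bd5-11eb-0a1b-2f3e4d5c6b7a
# Minimal sketch (toy values, not from the dataset) of what `onehot_df` does:
# the categorical column is dropped and replaced by one 0/1 indicator column
# per level, named with the first four characters of the key as a prefix.
onehot_df(DataFrame(statut = ["actif", "inactif", "actif"]), "statut", true)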
# ╔═╡ 90803e02-6bdd-11eb-0c9e-992ccc13cf4c
dateConv(d) = Dates.DateTime(d, "yyyy-mm-dd HH:MM:SS")
# ╔═╡ 50e1ff74-6be2-11eb-0f36-458c037d64fc
# Round a period to whole days and return the integer day count
convertDay(array) = Dates.value.(convert.(Dates.Day, round.(array, Dates.Day)))
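# ╔═╡ 58f2c1a0-6be2-11eb-1b2c-3d4e5f6a7b8c
# Quick sanity check (illustrative only): `convertDay` rounds each period to
# whole days before taking the integer day count, so 36 hours rounds to 2.
convertDay([Dates.Hour(36), Dates.Hour(12)])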
# ╔═╡ 29fdbcde-6bda-11eb-2d0e-f737cdf5d019
begin
    training = DataFrame(CSV.File("training_SC_GGP_AXA_FR.csv"))
    # Date conversion: fill missing claim dates with "now",
    # so contracts without claims get zero-length claim durations
    today = Dates.now()
    today_string = Dates.format(today, "yyyy-mm-dd HH:MM:SS")
    replace!(training.ouverture_dernier_sinistre, missing => today_string)
    replace!(training.cloture_dernier_sinistre, missing => today_string)
    for key in ["debut_contrat", "ouverture_dernier_sinistre", "cloture_dernier_sinistre"]
        training[!, key] = dateConv.(training[!, key])
    end
    # Derive durations (in days) and drop the raw date columns
    insertcols!(training, "duree_contrat" => convertDay(today - training.debut_contrat))
    insertcols!(training, "duree_dernier_sinistre" => convertDay(training.cloture_dernier_sinistre - training.ouverture_dernier_sinistre))
    insertcols!(training, "duree_zero_sinistre" => convertDay(today - training.cloture_dernier_sinistre))
    select!(training, Not("debut_contrat"))
    select!(training, Not("ouverture_dernier_sinistre"))
    select!(training, Not("cloture_dernier_sinistre"))
    # One-hot encode the categorical columns
    training = onehot_df(training, "departement", true)
    training = onehot_df(training, "categorie_socio_professionnelle", true)
    training = onehot_df(training, "type_de_bien", true)
    training = onehot_df(training, "statut", true)
    select!(training, Not("index"))
    training = Float32.(training)
end
# ╔═╡ 4d58899e-6bed-11eb-2a00-674d22246165
begin
    using Flux.Data: DataLoader
    using Flux: onehotbatch, onecold, logitcrossentropy, throttle, @epochs
    using Base.Iterators: repeated
    using Parameters: @with_kw
    using MLDatasets
    if has_cuda() # check if CUDA is available
        @info "CUDA is on"
        CUDA.allowscalar(false)
    end
    η = 1e-5          # learning rate
    batchsize = 1000  # batch size
    epochs = 1000     # number of epochs
    device = cpu      # set to gpu if a GPU is available
    function getdata()
        ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"
        # Load the preprocessed dataset (features × samples, as Flux expects)
        xtrain = permutedims(Array(Float32.(select(training, Not("target")))))
        ytrain = permutedims(Array(Float32.(select(training, "target"))))
        # No held-out split yet: the test set is the training set
        xtest, ytest = xtrain, ytrain
        #xtrain, ytrain = MLDatasets.MNIST.traindata(Float32)
        #xtest, ytest = MLDatasets.MNIST.testdata(Float32)
        @show(size(xtrain), size(ytrain))
        @show(typeof(xtrain), typeof(ytrain))
        # Flattening is only needed for image data (MNIST leftover)
        #xtrain = Flux.flatten(xtrain)
        #xtest = Flux.flatten(xtest)
        ytrain = dropdims(ytrain; dims=1)
        ytest = dropdims(ytest; dims=1)
        @show(size(xtrain), size(ytrain))
        @show(typeof(xtrain), typeof(ytrain))
        # One-hot encode the binary labels
        ytrain, ytest = onehotbatch(ytrain, 0:1), onehotbatch(ytest, 0:1)
        #ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)
        @show(size(xtrain), size(ytrain))
        @show(typeof(xtrain), typeof(ytrain))
        # Batching
        train_data = DataLoader(xtrain, ytrain, batchsize=batchsize, shuffle=true)
        test_data = DataLoader(xtest, ytest, batchsize=batchsize)
        return train_data, test_data
    end
    function build_model(; imgsize=(20, 1), nclasses=2, hidden=1024)
        # prod(imgsize) must equal the number of feature columns
        return Chain(
            Dense(prod(imgsize), hidden, relu),
            Dense(hidden, hidden, relu),
            Dense(hidden, nclasses))
    end
    function loss_all(dataloader, model)
        l = 0f0
        for (x, y) in dataloader
            l += logitcrossentropy(model(x), y)
        end
        l / length(dataloader)
    end
    function accuracy(data_loader, model)
        acc = 0
        for (x, y) in data_loader
            acc += sum(onecold(cpu(model(x))) .== onecold(cpu(y))) / size(x, 2)
        end
        acc / length(data_loader)
    end
    function focal_loss(yh, y)
        ce_loss = Flux.crossentropy(yh, y; agg=identity)
        pt = exp.(-ce_loss)
        gamma = 100 # note: γ ≈ 2 in the focal-loss paper; 100 suppresses almost every term
        return mean(((1 .- pt) .^ gamma) .* ce_loss)
    end
    function train()
        # Load data and construct the model
        train_data, test_data = getdata()
        m = build_model()
        train_data = device.(train_data)
        test_data = device.(test_data)
        m = device(m)
        #loss(x, y) = focal_loss(m(x), y)
        loss(x, y) = logitcrossentropy(m(x), y)
        ## Training (the train! call is currently commented out)
        evalcb = () -> @show(loss_all(train_data, m), accuracy(train_data, m))
        opt = ADAM(η)
        #Flux.@epochs epochs Flux.train!(loss, params(m), train_data, opt, cb = evalcb)
        #@show accuracy(train_data, m)
        #@show accuracy(test_data, m)
    end
    train()
end
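# ╔═╡ 51e3a9b0-6bed-11eb-2c3d-4e5f6a7b8c9d
# Illustrative comparison (not part of the original run) of plain cross-entropy
# with `focal_loss` on a toy batch of class probabilities: the (1 - pt)^γ factor
# down-weights the confident column, and with γ = 100 both terms all but vanish.
begin
    yh_demo = Float32[0.9 0.6; 0.1 0.4]      # predicted probabilities, 2 classes × 2 samples
    y_demo = Flux.onehotbatch([0, 0], 0:1)   # both samples belong to class 0
    (ce = Flux.crossentropy(yh_demo, y_demo), focal = focal_loss(yh_demo, y_demo))
end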
# ╔═╡ 5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
begin
    using Lathe.preprocess: TrainTestSplit
    using Lathe.lstats: catacc
    dtrain, dtest = TrainTestSplit(training)
end
# ╔═╡ c7ca7334-6bdb-11eb-2413-f50b8cd50f89
size(training)
# ╔═╡ b69d0732-6bd8-11eb-0144-39023b4d5326
describe(training)
# ╔═╡ 964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
# Earlier manual Flux experiment, kept commented out for reference:
#begin
#    X = permutedims(Array(Float32.(select(training, Not("target")))))  #[:, 1:100]  # |> gpu
#    Y = permutedims(Array(Float32.(select(training, "target"))))       #[:, 1:100]  # |> gpu
#    Y = onehotbatch(dropdims(Y; dims=1), 0:1)
#    m = Chain(
#        Dense(size(X)[1], 32, relu),
#        Dense(32, 2),
#        softmax)  # |> gpu
#    function loss(x, y)
#        return Flux.logitcrossentropy(m(x), y)
#    end
#    function Accuracy(x, y)
#        a = Flux.onecold(m(x))
#        b = Flux.onecold(y)  # |> gpu  # without this it becomes a plain Julia array
#        @show(size(a), size(b))
#        return mean(a .== b)
#    end
#    dataset = Iterators.repeated((X, Y), 10)
#    evalcb = () -> @show(loss(X, Y), Accuracy(X, Y))
#    opt = ADAM(0.001, (0.9, 0.999))
#    Flux.@epochs 100 Flux.train!(loss, params(m), dataset, opt, cb = Flux.throttle(evalcb, 10))
#    Accuracy(X, Y), loss(X, Y)
#end
# ╔═╡ 4a55b5fa-6c47-11eb-1b2d-95ae61217361
begin
    X = Float32.(select(training, Not("target")))
    Y = Float32.(training[!, "target"]) |> Vector
end
# ╔═╡ e250e5da-6c49-11eb-1005-8567cf37eaf8
begin
    ppca = SKPreprocessor("PCA");
    pnumf = NumFeatureSelector();
    prb = SKPreprocessor("RobustScaler");
    pohePCA = @pipeline pnumf |> prb |> ppca
    trPCA = fit_transform!(pohePCA, X, Y)
    # 2-D PCA projection; marker size encodes the target class
    scatter(trPCA[:, 1], trPCA[:, 2], markersize=3 .* Y .+ 1)
end
# ╔═╡ e9e72212-6c4b-11eb-0f57-9bd831b4dd80
begin
    prf = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 100)))
    rfp1 = @pipeline pnumf |> prb |> prf;
    crossvalidate(rfp1, X, Y)
end
# ╔═╡ 3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1
begin
    # from discourse discussion with zevelev
    #addprocs()
    #@everywhere using AutoMLPipeline, DataFrames
    # Get models.
    sk = AutoMLPipeline.SKLearners.learner_dict |> keys |> collect;
    sk = sk |> x -> sort(x, lt=(x, y) -> lowercase(x) < lowercase(y));
    m_cl = sk[occursin.("Classifier", sk)];
    m_cl = m_cl ∪ sk[occursin.("NB", sk)];
    m_cl = m_cl ∪ sk[occursin.("SVC", sk)];
    m_cl = m_cl ∪ ["LDA", "QDA"];
    # find optimal learners
    learners = @distributed (vcat) for m in m_cl
        learner = SKLearner(m)
        pcmc = AutoMLPipeline.@pipeline learner
        println(learner.name)
        mean, sd, folds, err = crossvalidate(pcmc, X, Y, "accuracy_score", 5)
        if !isnan(mean)
            DataFrame(name=learner.name, mean=mean, sd=sd, folds=folds, errors=err)
        else
            DataFrame()
        end
    end;
    sort!(learners, :mean, rev=true)
    @show learners;
    # optimized C
    #results = @distributed (vcat) for C in 1:5
    #    @distributed (vcat) for gamma = 1:5
    #        svcmodel = SKLearner("SVC", Dict(:impl_args => Dict(:kernel => "rbf", :C => C, :gamma => gamma)))
    #        mn, sd, fld, err = crossvalidate(svcmodel, X, Y)
    #        DataFrame(name=svcmodel.name, mean=mn, sd=sd, C=C, gamma=gamma, folds=fld, errors=err)
    #    end
    #end
    #sort!(results, :mean, rev=true)
    #@show results
    # search best learner by crossvalidation and use it for prediction
    #learners = SKLearner.(["AdaBoostClassifier", "BaggingClassifier", "SGDClassifier", "SVC", "LinearSVC"])
    #blearner = BestLearner(learners)
    #crossvalidate(blearner, X, Y, "accuracy_score")
    #fit!(blearner, X, Y)
end
# ╔═╡ 996a1126-6c4d-11eb-34fc-c97f0636695d
learners[1, :]
# ╔═╡ 6eb62f38-6c4f-11eb-0d56-975521d7d224
begin
    # Add workers
    nprocs() == 1 && addprocs();
    workers()
    #### feature selectors
    catf = CatFeatureSelector();
    numf = NumFeatureSelector();
    # hot-bit encoder
    ohe = AutoMLPipeline.OneHotEncoder();
    #### feature scalers
    rb = SKPreprocessor("RobustScaler");
    pt = SKPreprocessor("PowerTransformer");
    mx = SKPreprocessor("MinMaxScaler");
    std = SKPreprocessor("StandardScaler");
    norm = SKPreprocessor("Normalizer");
    #### feature extractors
    #pca = SKPreprocessor("PCA", Dict(:autocomponent => true));
    #ica = SKPreprocessor("FastICA", Dict(:autocomponent => true));
    #fa = SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true));
    #### learners
    rf = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 10)));
    gb = SKLearner("GradientBoostingClassifier");
    lsvc = SKLearner("LinearSVC");
    mlp = SKLearner("MLPClassifier");
    stack = StackEnsemble();
    rbfsvc = SKLearner("SVC");
    ada = SKLearner("AdaBoostClassifier");
    vote = VoteEnsemble();
    best = BestLearner();
    tree = PrunedTree();
    sgd = SKLearner("SGDClassifier");
    noop = Identity(Dict(:name => "Noop"));
    # Parallel search for optimal data-mining pipelines
    function prpsearch()
        learners = [rf, ada, sgd, tree, rbfsvc, lsvc, gb];
        scalers = [rb, pt, norm, std, mx, noop];
        dftable = @sync @distributed (vcat) for lr in learners
            @distributed (vcat) for sc in scalers
                pipe = AutoMLPipeline.@pipeline (catf |> ohe) + (numf |> sc) |> lr
                # drop the 4-character suffix from the auto-generated names
                scn = sc.name[1:end-4]; lrn = lr.name[1:end-4]
                pname = "$scn |> $lrn"
                ptime = @elapsed begin
                    mean, sd, kfold, _ = crossvalidate(pipe, X, Y, "accuracy_score", 5)
                end
                DataFrame(pipeline=pname, mean=mean, sd=sd, time=ptime, folds=kfold)
            end
        end
        sort!(dftable, :mean, rev=true);
        dftable
    end
    runtime = @elapsed begin
        df = prpsearch()   # `df` is displayed in a later cell, so this call must run
    end;
    serialtime = df.time |> sum;
    # pipeline performances: `df` is shown in the next cell
    (serialtime = "$(round(serialtime / 60.0)) minutes", paralleltime = "$(round(runtime)) seconds")
end
# ╔═╡ 22b11720-6c50-11eb-3e44-c726d18c6948
df
# ╔═╡ e3d0797a-6c53-11eb-1923-c719d8210533
gb
# ╔═╡ cd229ebc-6c56-11eb-1f44-b3ce2e8e0875
begin
    jlb = JLBoostClassifier()
    r1 = range(jlb, :nrounds, lower=1, upper=6)
    r2 = range(jlb, :max_depth, lower=1, upper=6)
    r3 = range(jlb, :eta, lower=0.1, upper=1.0)
    tm = TunedModel(model=jlb, ranges=[r1, r2, r3], measure=cross_entropy)
    XX = X
    YY = categorical(Bool.(Y))
    m = machine(tm, XX, YY)
end
# ╔═╡ 13522512-6c5a-11eb-2278-2d6bda47e9a1
parent(YY)
# ╔═╡ 331af5c8-6c59-11eb-2dbd-c3a9b8127f97
MLJ.fit!(m)
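# ╔═╡ 3b7c5d20-6c59-11eb-3d4e-5f6a7b8c9d0e
# Illustrative follow-up (not in the original notebook): once the TunedModel
# machine is fitted, MLJ's report exposes the best hyperparameter combination
# found over the nrounds / max_depth / eta ranges defined above.
report(m).best_model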
# ╔═╡ 232e07a2-6c5b-11eb-15e2-2b94b7f934dc
X
# ╔═╡ 24c4b282-6c5b-11eb-3a32-3936b6fde11e
Y
# ╔═╡ 5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
training
# ╔═╡ a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
dtest
# ╔═╡ 834f2260-6c5b-11eb-3577-3d0ffc913da6
begin
    trainX = select(dtrain, Not("target"))
    trainy = dtrain[!, "target"]
    testX = select(dtest, Not("target"))
    testy = dtest[!, "target"]
end
# ╔═╡ dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
begin
    model = RandomForestClassifier(trainX, trainy, n_trees=10, max_depth=11)
    yhat = model.predict(testX)
    # distinct predicted labels and categorical accuracy
    Set(yhat), catacc(yhat, testy)
end
# ╔═╡ eb259a9e-6c5c-11eb-231a-c37636e42302
begin
    Booster = @load EvoTreeRegressor
    booster = Booster(max_depth=2) # specify hyperparameter at construction
    booster.nrounds = 50           # or mutate post facto
end
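# ╔═╡ f01a2b30-6c5c-11eb-0e1f-2a3b4c5d6e7f
# Minimal sketch (assumed follow-up, not run in the original): cross-validate
# the EvoTrees booster on the notebook's X and Y. EvoTreeRegressor has a
# Continuous target, so the Float32 vector Y is scored directly with rms.
evaluate(booster, X, Y, resampling=CV(nfolds=3, shuffle=true), measure=rms)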
# ╔═╡ f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc
# ╔═╡ f70c90ee-6c5c-11eb-355e-454a4b2f89d9
# ╔═╡ ffce310e-6c5c-11eb-2025-5f17f3cbed51
# ╔═╡ 091bbe02-6c5d-11eb-2979-75c5cab5b613
# ╔═╡ Cell order:
# ╠═e8322e0e-6bd2-11eb-3c59-032d9259f937
# ╠═87c0f780-6c4f-11eb-28d0-3dedf07b9c50
# ╠═962ac8c4-6be0-11eb-3865-f5dce353b241
# ╠═75736b1c-6bd5-11eb-08a7-630cd343e6bb
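# ╠═7a1d2f40-6bd5-11eb-0a1b-2f3e4d5c6b7a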
# ╠═90803e02-6bdd-11eb-0c9e-992ccc13cf4c
# ╠═50e1ff74-6be2-11eb-0f36-458c037d64fc
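# ╠═58f2c1a0-6be2-11eb-1b2c-3d4e5f6a7b8c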
# ╠═29fdbcde-6bda-11eb-2d0e-f737cdf5d019
# ╠═c7ca7334-6bdb-11eb-2413-f50b8cd50f89
# ╠═b69d0732-6bd8-11eb-0144-39023b4d5326
# ╠═964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
# ╠═4d58899e-6bed-11eb-2a00-674d22246165
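# ╠═51e3a9b0-6bed-11eb-2c3d-4e5f6a7b8c9d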
# ╠═4a55b5fa-6c47-11eb-1b2d-95ae61217361
# ╠═e250e5da-6c49-11eb-1005-8567cf37eaf8
# ╠═e9e72212-6c4b-11eb-0f57-9bd831b4dd80
# ╠═3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1
# ╠═996a1126-6c4d-11eb-34fc-c97f0636695d
# ╠═6eb62f38-6c4f-11eb-0d56-975521d7d224
# ╠═22b11720-6c50-11eb-3e44-c726d18c6948
# ╠═e3d0797a-6c53-11eb-1923-c719d8210533
# ╠═cd229ebc-6c56-11eb-1f44-b3ce2e8e0875
# ╠═13522512-6c5a-11eb-2278-2d6bda47e9a1
# ╠═ce121a34-6c59-11eb-019d-0bf399586199
# ╠═331af5c8-6c59-11eb-2dbd-c3a9b8127f97
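# ╠═3b7c5d20-6c59-11eb-3d4e-5f6a7b8c9d0e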
# ╠═d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
# ╠═232e07a2-6c5b-11eb-15e2-2b94b7f934dc
# ╠═24c4b282-6c5b-11eb-3a32-3936b6fde11e
# ╠═5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
# ╠═5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
# ╠═a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
# ╠═834f2260-6c5b-11eb-3577-3d0ffc913da6
# ╠═dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
# ╠═eb259a9e-6c5c-11eb-231a-c37636e42302
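# ╠═f01a2b30-6c5c-11eb-0e1f-2a3b4c5d6e7f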
# ╠═f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc
# ╠═f70c90ee-6c5c-11eb-355e-454a4b2f89d9
# ╠═ffce310e-6c5c-11eb-2025-5f17f3cbed51
# ╠═091bbe02-6c5d-11eb-2979-75c5cab5b613