### A Pluto.jl notebook ###
# v0.12.20

using Markdown
using InteractiveUtils

# ╔═╡ e8322e0e-6bd2-11eb-3c59-032d9259f937
begin
    using Dates
    using CSV, Statistics
    #using DataFrames
    using Plots
    #using AutoMLPipeline
    using Flux
    using CUDA
    using MultivariateStats
    using Distributed
    using ScikitLearn
    using JLBoost, JLBoostMLJ, MLJ
end

# ╔═╡ 87c0f780-6c4f-11eb-28d0-3dedf07b9c50
begin
    @sync @everywhere using AutoMLPipeline
    @sync @everywhere using DataFrames
end

# ╔═╡ ce121a34-6c59-11eb-019d-0bf399586199
begin
    using RDatasets
    iris = dataset("datasets", "iris")
    iris[!, :is_setosa] = iris.Species .== "setosa"
    Xi, yi = unpack(iris, x -> !(x in [:is_setosa, :Species]), ==(:is_setosa))
end

# ╔═╡ d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
begin
    using Lathe.models: RandomForestClassifier
    using StatsBase
end

# ╔═╡ 962ac8c4-6be0-11eb-3865-f5dce353b241
# first space-delimited token of a label, used to shorten one-hot column names
splitLabel(label) = split(label, " ")[1]

# ╔═╡ 75736b1c-6bd5-11eb-08a7-630cd343e6bb
# replace column `key` with one-hot indicator columns (or just drop the column
# when insert == false); new columns are named "<first 4 chars of key>_<label>"
function onehot_df(training::DataFrame, key::String, insert::Bool)
    if !insert
        return select(training, Not(key))
    end
    labels = sort(unique(training[!, key]))
    oh_key = Flux.onehotbatch(training[!, key], labels)
    return hcat(select(training, Not(key)),
                DataFrame(oh_key', (key[1:4] * "_") .* splitLabel.(labels)))
end

# ╔═╡ 90803e02-6bdd-11eb-0c9e-992ccc13cf4c
dateConv(d) = Dates.DateTime(d, "yyyy-mm-dd HH:MM:SS")

# ╔═╡ 50e1ff74-6be2-11eb-0f36-458c037d64fc
# period -> whole number of days
convertDay(array) = Dates.value.(convert.(Dates.Day, round.(array, Dates.Day)))

# ╔═╡ 29fdbcde-6bda-11eb-2d0e-f737cdf5d019
begin
    training = DataFrame(CSV.File("training_SC_GGP_AXA_FR.csv"))

    # Date conversion: fill missing claim dates with the current timestamp
    today = Dates.now()
    today_string = Dates.format(today, "yyyy-mm-dd HH:MM:SS")
    replace!(training.ouverture_dernier_sinistre, missing => today_string)
    replace!(training.cloture_dernier_sinistre, missing => today_string)
    for key in ["debut_contrat", "ouverture_dernier_sinistre", "cloture_dernier_sinistre"]
        training[!, key] = dateConv.(training[!, key])
    end

    # Add durations (in days) derived from the date columns
    # (scalar-vector subtractions need broadcasting)
    insertcols!(training, "duree_contrat" => convertDay(today .- training.debut_contrat))
    insertcols!(training, "duree_dernier_sinistre" => convertDay(training.cloture_dernier_sinistre .- training.ouverture_dernier_sinistre))
    insertcols!(training, "duree_zero_sinistre" => convertDay(today .- training.cloture_dernier_sinistre))
    select!(training, Not("debut_contrat"))
    select!(training, Not("ouverture_dernier_sinistre"))
    select!(training, Not("cloture_dernier_sinistre"))

    # One-hot encode the categorical columns
    training = onehot_df(training, "departement", true)
    training = onehot_df(training, "categorie_socio_professionnelle", true)
    training = onehot_df(training, "type_de_bien", true)
    training = onehot_df(training, "statut", true)
    select!(training, Not("index"))
    # all remaining columns are numeric; convert everything to Float32
    training = Float32.(training)
end

# ╔═╡ 4d58899e-6bed-11eb-2a00-674d22246165
begin
    using Flux.Data: DataLoader
    using Flux: onehotbatch, onecold, logitcrossentropy, throttle, @epochs
    using Base.Iterators: repeated
    using Parameters: @with_kw
    using MLDatasets

    if has_cuda()  # Check if CUDA is available
        @info "CUDA is on"
        CUDA.allowscalar(false)
    end

    η = 1e-5            # learning rate
    batchsize = 1000    # batch size
    epochs = 1000       # number of epochs
    device = cpu        # set as gpu, if gpu available

    function getdata()
        ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

        # Loading dataset; note there is no held-out set: test == train here
        xtrain = permutedims(Array(Float32.(select(training, Not("target")))))
        ytrain = permutedims(Array(Float32.(select(training, "target"))))
        xtest, ytest = xtrain, ytrain
        #xtrain, ytrain = MLDatasets.MNIST.traindata(Float32)
        #xtest, ytest = MLDatasets.MNIST.testdata(Float32)
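        # (Illustrative addition, not in the original notebook) Quick
        # class-balance check before batching; a skewed target is what
        # motivates the focal_loss experiment further down in this cell.
        # `ytrain` is a 1×N Float32 matrix at this point.
        npos = count(==(1f0), ytrain)
        @info "target balance" positives=npos total=length(ytrain) rate=npos/length(ytrain)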
        @show(size(xtrain), size(ytrain))
        @show(typeof(xtrain), typeof(ytrain))

        # Reshape step left over from the MNIST template (images -> vectors)
        #xtrain = Flux.flatten(xtrain)
        #xtest = Flux.flatten(xtest)
        ytrain = dropdims(ytrain; dims = 1)
        ytest = dropdims(ytest; dims = 1)
        @show(size(xtrain), size(ytrain))
        @show(typeof(xtrain), typeof(ytrain))

        # One-hot-encode the labels
        ytrain, ytest = onehotbatch(ytrain, 0:1), onehotbatch(ytest, 0:1)
        #ytrain, ytest = onehotbatch(ytrain, 0:9), onehotbatch(ytest, 0:9)
        @show(size(xtrain), size(ytrain))
        @show(typeof(xtrain), typeof(ytrain))

        # Batching
        train_data = DataLoader(xtrain, ytrain, batchsize=batchsize, shuffle=true)
        test_data = DataLoader(xtest, ytest, batchsize=batchsize)

        return train_data, test_data
    end

    # NB: prod(imgsize) must equal the number of feature rows in xtrain
    function build_model(; imgsize=(20, 1), nclasses=2, hidden=1024)
        return Chain(
            Dense(prod(imgsize), hidden, relu),
            Dense(hidden, hidden, relu),
            Dense(hidden, nclasses))
    end

    # mean logitcrossentropy over all batches of a DataLoader
    function loss_all(dataloader, model)
        l = 0f0
        for (x, y) in dataloader
            l += logitcrossentropy(model(x), y)
        end
        l / length(dataloader)
    end

    function accuracy(data_loader, model)
        acc = 0
        for (x, y) in data_loader
            acc += sum(onecold(cpu(model(x))) .== onecold(cpu(y))) / size(x, 2)
        end
        acc / length(data_loader)
    end

    # focal loss: down-weights well-classified examples via the (1 - pt)^γ factor
    function focal_loss(yh, y)
        ce_loss = Flux.crossentropy(yh, y; agg=identity)
        pt = exp.(-ce_loss)
        gamma = 100
        return mean(((1 .- pt) .^ gamma) .* ce_loss)
    end

    function train()
        # Initializing model parameters

        # Load data
        train_data, test_data = getdata()

        # Construct model
        m = build_model()
        train_data = device.(train_data)
        test_data = device.(test_data)
        m = device(m)

        #loss(x, y) = focal_loss(m(x), y)
        loss(x, y) = logitcrossentropy(m(x), y)

        ## Training
        evalcb = () -> @show(loss_all(train_data, m), accuracy(train_data, m))
        opt = ADAM(η)

        #Flux.@epochs epochs Flux.train!(loss, params(m), train_data, opt, cb = evalcb)

        #@show accuracy(train_data, m)
        #@show accuracy(test_data, m)
    end

    train()
end

# ╔═╡ 5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
begin
    using Lathe.preprocess: TrainTestSplit
    using Lathe.lstats: catacc
    dtrain, dtest = TrainTestSplit(training)
end

# ╔═╡ c7ca7334-6bdb-11eb-2413-f50b8cd50f89
size(training)

# ╔═╡ b69d0732-6bd8-11eb-0144-39023b4d5326
describe(training)

# ╔═╡ 964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
# earlier Flux experiment, kept disabled
#begin
#    X = permutedims(Array(Float32.(select(training, Not("target")))))#[:, 1:100]# |> gpu
#    Y = permutedims(Array(Float32.(select(training, "target"))))#[:, 1:100]# |> gpu
#    Y = onehotbatch(dropdims(Y; dims = 1), 0:1)
#    m = Chain(
#        Dense(size(X)[1], 32, relu),
#        Dense(32, 2),
#        softmax)# |> gpu
#    function loss(x, y)
#        return Flux.logitcrossentropy(m(x), y)
#    end
#    function Accuracy(x, y)
#        a = Flux.onecold(m(x))
#        b = Flux.onecold(y)# |> gpu  #### If this is not there, it becomes a Julia array
#        @show(size(a), size(b))
#        return mean(a .== b)
#    end
#    dataset = Iterators.repeated((X, Y), 10)
#    evalcb = () -> @show(loss(X, Y), Accuracy(X, Y))
#    opt = ADAM(0.001, (0.9, 0.999))
#    Flux.@epochs 100 Flux.train!(loss, params(m), dataset, opt, cb = Flux.throttle(evalcb, 10))
#    Accuracy(X, Y), loss(X, Y)
#end

# ╔═╡ 4a55b5fa-6c47-11eb-1b2d-95ae61217361
begin
    X = Float32.(select(training, Not("target")))
    Y = Float32.(training[!, "target"]) |> Vector
end

# ╔═╡ e250e5da-6c49-11eb-1005-8567cf37eaf8
begin
    ppca = SKPreprocessor("PCA");
    pnumf = NumFeatureSelector();
    prb = SKPreprocessor("RobustScaler");
    pohePCA = @pipeline pnumf |> prb |> ppca
    trPCA = fit_transform!(pohePCA, X, Y)
    scatter(trPCA[:, 1], trPCA[:, 2], markersize=3 .* Y .+ 1)
end
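# ╔═╡ b2f1a9c0-6c5d-11eb-2b3a-4d5e6f708192
# (Illustrative addition, not an original cell) Cross-check the sklearn PCA
# projection plotted above using MultivariateStats directly, to see what
# fraction of the total variance a 2-D projection retains. Assumes `X` is the
# all-numeric Float32 feature DataFrame defined earlier; this checks the raw
# (unscaled) features, so it is only a rough sanity check.
begin
    Xmat = permutedims(Matrix{Float32}(X))  # MultivariateStats expects features × observations
    pca_check = MultivariateStats.fit(MultivariateStats.PCA, Xmat; maxoutdim=2)
    MultivariateStats.principalratio(pca_check)  # variance retained by the 2-D projection
end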
SKLearner("RandomForestClassifier",Dict(:impl_args=>Dict(:n_estimators => 100))) rfp1 = @pipeline pnumf |> prb |> prf; crossvalidate(rfp1, X,Y) end # ╔═╡ 3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1 begin # from discourse discussion with zevelev #addprocs() #@everywhere using AutoMLPipeline, DataFrames #Get models. sk= AutoMLPipeline.SKLearners.learner_dict |> keys |> collect; sk= sk |> x-> sort(x,lt=(x,y)->lowercase(x)Dict(:kernel=>"rbf",:C=>C,:gamma=>gamma) )) # mn,sd,fld,err = crossvalidate(svcmodel,X,Y) #DataFrame(name=svcmodel.name,mean=mn,sd=sd,C=C,gamma=gamma,folds=fld,errors=err) # end #end #sort!(results,:mean,rev=true) #@show results # search best learner by crossvalidation and use it for prediction #learners = SKLearner.(["AdaBoostClassifier","BaggingClassifier","SGDClassifier","SVC","LinearSVC"]) #blearner = BestLearner(learners) #crossvalidate(blearner,X,Y,"accuracy_score") #fit!(blearner,X,Y) end # ╔═╡ 996a1126-6c4d-11eb-34fc-c97f0636695d learners[1, :] # ╔═╡ 6eb62f38-6c4f-11eb-0d56-975521d7d224 begin # Add workers nprocs() == 1 && addprocs(); workers() #### feature selectors catf = CatFeatureSelector(); numf = NumFeatureSelector(); # hot-bit encoder ohe = AutoMLPipeline.OneHotEncoder(); #### feature scalers rb = SKPreprocessor("RobustScaler"); pt = SKPreprocessor("PowerTransformer"); mx = SKPreprocessor("MinMaxScaler"); std = SKPreprocessor("StandardScaler"); norm = SKPreprocessor("Normalizer"); #### feature extractors #pca = SKPreprocessor("PCA", Dict(:autocomponent => true)); #ica = SKPreprocessor("FastICA", Dict(:autocomponent => true)); #fa = SKPreprocessor("FactorAnalysis", Dict(:autocomponent => true)); #### Learners rf = SKLearner("RandomForestClassifier", Dict(:impl_args => Dict(:n_estimators => 10))); gb = SKLearner("GradientBoostingClassifier"); lsvc = SKLearner("LinearSVC"); mlp = SKLearner("MLPClassifier"); stack = StackEnsemble(); rbfsvc = SKLearner("SVC"); ada = SKLearner("AdaBoostClassifier"); vote = VoteEnsemble(); best = BestLearner(); tree = PrunedTree(); sgd = SKLearner("SGDClassifier"); noop = Identity(Dict(:name => "Noop")); # Parallel Search for Datamining Optimal Pipelines function prpsearch() learners = [rf,ada,sgd,tree,rbfsvc,lsvc,gb]; scalers = [rb,pt,norm,std,mx,noop]; dftable = @sync @distributed (vcat) for lr in learners @distributed (vcat) for sc in scalers pipe = AutoMLPipeline.@pipeline (catf |> ohe) + (numf |> sc ) |> lr scn = sc.name[1:end - 4]; lrn = lr.name[1:end - 4] pname = "$scn |> $lrn" ptime = @elapsed begin mean, sd, kfold, _ = crossvalidate(pipe, X, Y, "accuracy_score", 5) end DataFrame(pipeline=pname, mean=mean, sd=sd, time=ptime, folds=kfold) end end sort!(dftable, :mean, rev=true); dftable end runtime = @elapsed begin #df = prpsearch() end; #serialtime = df.time |> sum; #(serialtime = "$(round(serialtime / 60.0)) minutes", paralleltime = "$(round(runtime)) seconds") # pipeline performances #@show df end # ╔═╡ 22b11720-6c50-11eb-3e44-c726d18c6948 df # ╔═╡ e3d0797a-6c53-11eb-1923-c719d8210533 gb # ╔═╡ cd229ebc-6c56-11eb-1f44-b3ce2e8e0875 begin jlb = JLBoostClassifier() r1 = range(jlb, :nrounds, lower=1, upper = 6) r2 = range(jlb, :max_depth, lower=1, upper = 6) r3 = range(jlb, :eta, lower=0.1, upper=1.0) tm = TunedModel(model = jlb, ranges = [r1, r2, r3], measure = cross_entropy) XX = X YY = categorical(Bool.(Y)) m = machine(tm, XX, YY) end # ╔═╡ 13522512-6c5a-11eb-2278-2d6bda47e9a1 parent(YY) # ╔═╡ 331af5c8-6c59-11eb-2dbd-c3a9b8127f97 MLJ.fit!(m) # ╔═╡ 232e07a2-6c5b-11eb-15e2-2b94b7f934dc X # ╔═╡ 24c4b282-6c5b-11eb-3a32-3936b6fde11e Y # 
# ╔═╡ 5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
training

# ╔═╡ a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
dtest

# ╔═╡ 834f2260-6c5b-11eb-3577-3d0ffc913da6
begin
    trainX = select(dtrain, Not("target"))
    trainy = dtrain[!, "target"]
    testX = select(dtest, Not("target"))
    testy = dtest[!, "target"]
end

# ╔═╡ dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
begin
    model = RandomForestClassifier(trainX, trainy, n_trees=10, max_depth=11)
    yhat = model.predict(testX)
    Set(yhat), catacc(yhat, testy)
end

# ╔═╡ eb259a9e-6c5c-11eb-231a-c37636e42302
begin
    Booster = @load EvoTreeRegressor
    booster = Booster(max_depth=2)  # specify hyperparameter at construction
    booster.nrounds = 50            # or mutate post facto
end

# ╔═╡ f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc

# ╔═╡ f70c90ee-6c5c-11eb-355e-454a4b2f89d9

# ╔═╡ ffce310e-6c5c-11eb-2025-5f17f3cbed51

# ╔═╡ 091bbe02-6c5d-11eb-2979-75c5cab5b613

# ╔═╡ Cell order:
# ╠═e8322e0e-6bd2-11eb-3c59-032d9259f937
# ╠═87c0f780-6c4f-11eb-28d0-3dedf07b9c50
# ╠═962ac8c4-6be0-11eb-3865-f5dce353b241
# ╠═75736b1c-6bd5-11eb-08a7-630cd343e6bb
# ╠═90803e02-6bdd-11eb-0c9e-992ccc13cf4c
# ╠═50e1ff74-6be2-11eb-0f36-458c037d64fc
# ╠═29fdbcde-6bda-11eb-2d0e-f737cdf5d019
# ╠═c7ca7334-6bdb-11eb-2413-f50b8cd50f89
# ╠═b69d0732-6bd8-11eb-0144-39023b4d5326
# ╠═964d8b78-6bf6-11eb-1129-9b6e7d2c21d1
# ╠═4d58899e-6bed-11eb-2a00-674d22246165
# ╠═4a55b5fa-6c47-11eb-1b2d-95ae61217361
# ╠═e250e5da-6c49-11eb-1005-8567cf37eaf8
# ╠═b2f1a9c0-6c5d-11eb-2b3a-4d5e6f708192
# ╠═e9e72212-6c4b-11eb-0f57-9bd831b4dd80
# ╠═3c0a1b1c-6c4c-11eb-1cb8-bb4182cf5dd1
# ╠═996a1126-6c4d-11eb-34fc-c97f0636695d
# ╠═6eb62f38-6c4f-11eb-0d56-975521d7d224
# ╠═22b11720-6c50-11eb-3e44-c726d18c6948
# ╠═e3d0797a-6c53-11eb-1923-c719d8210533
# ╠═cd229ebc-6c56-11eb-1f44-b3ce2e8e0875
# ╠═13522512-6c5a-11eb-2278-2d6bda47e9a1
# ╠═ce121a34-6c59-11eb-019d-0bf399586199
# ╠═331af5c8-6c59-11eb-2dbd-c3a9b8127f97
# ╠═c4d2e8a0-6c5d-11eb-3c4b-5e6f70819203
# ╠═d0e9a9d8-6c5a-11eb-059d-a51a5d6aa7e8
# ╠═232e07a2-6c5b-11eb-15e2-2b94b7f934dc
# ╠═24c4b282-6c5b-11eb-3a32-3936b6fde11e
# ╠═5e8e1a92-6c5b-11eb-3a34-a5d8452288b5
# ╠═5ccf4fc8-6c5b-11eb-1aaa-876ea467c25d
# ╠═a98857da-6c5b-11eb-3d4e-75e9ad20b4fc
# ╠═834f2260-6c5b-11eb-3577-3d0ffc913da6
# ╠═dcb1ae36-6c5b-11eb-2e8c-8ff8c85b46c7
# ╠═eb259a9e-6c5c-11eb-231a-c37636e42302
# ╠═f24fea0e-6c5c-11eb-0c7a-8f2bd3550ecc
# ╠═f70c90ee-6c5c-11eb-355e-454a4b2f89d9
# ╠═ffce310e-6c5c-11eb-2025-5f17f3cbed51
# ╠═091bbe02-6c5d-11eb-2979-75c5cab5b613