### A Pluto.jl notebook ### # v0.12.20 using Markdown using InteractiveUtils # ╔═╡ 21895968-6c5d-11eb-1800-cd0f921c6d83 begin using DataFrames, CSV using Dates using Flux end # ╔═╡ 326f6ec0-6c5d-11eb-105a-ff2f251973c2 splitLabel(label) = split(label, " ")[1] # ╔═╡ 3771c65c-6c5d-11eb-24d7-39b38b5bfde6 function onehot_df(training::DataFrame, key::String, labels::Array, insert::Bool) if !insert return select(training, Not(key)) end oh_key = Flux.onehotbatch(training[!, key], labels) return hcat(select(training, Not(key)), DataFrame(oh_key', (key[1:4]*"_") .* splitLabel.(labels))) end # ╔═╡ 3b016552-6c5d-11eb-0818-096189d166bb dateConv(d) = Dates.DateTime(d, "yyyy-mm-dd HH:MM:SS") # ╔═╡ 49b50c02-6c5d-11eb-2c20-15a0ffa11887 convertDay(array) = Dates.value.(convert.(Dates.Day, round.(array, Dates.Day))) # ╔═╡ 4d686d12-6c5d-11eb-0f48-f9ad13682cd2 begin filename_training = "training_SC_GGP_AXA_FR.csv" filename_scoring = "scoring_SC_GGP_AXA_FR.csv" training = DataFrame(CSV.File(filename_training)) scoring = DataFrame(CSV.File(filename_scoring)) # Date conversion # Missing today_ = Dates.now() today_string = Dates.format(today_, "yyyy-mm-dd HH:MM:SS") replace!(training.ouverture_dernier_sinistre, missing => today_string) replace!(scoring.ouverture_dernier_sinistre, missing => today_string) replace!(training.cloture_dernier_sinistre, missing => today_string) replace!(scoring.cloture_dernier_sinistre, missing => today_string) # Date conversion date_keys = ["debut_contrat", "ouverture_dernier_sinistre", "cloture_dernier_sinistre"] for key in date_keys training[!, key] = dateConv.(training[!, key]) scoring[!, key] = dateConv.(scoring[!, key]) end # Add duree insertcols!(training, "duree_contrat" => convertDay(today_ - training.debut_contrat)) insertcols!(scoring, "duree_contrat" => convertDay(today_ - scoring.debut_contrat)) insertcols!(training, "duree_dernier_sinistre" => convertDay(training.cloture_dernier_sinistre - training.ouverture_dernier_sinistre)) insertcols!(scoring, "duree_dernier_sinistre" => convertDay(scoring.cloture_dernier_sinistre - scoring.ouverture_dernier_sinistre)) insertcols!(training, "duree_zero_sinistre" => convertDay(today_ - training.cloture_dernier_sinistre)) insertcols!(scoring, "duree_zero_sinistre" => convertDay(today_ - scoring.cloture_dernier_sinistre)) # Remove dates for key in date_keys select!(training, Not(key)) select!(scoring, Not(key)) end # One hot encode categorical categorical_keys = ["departement", "categorie_socio_professionnelle", "type_de_bien", "statut"] insert_bools = [true, true, true, true] for i in 1:4 key = categorical_keys[i] insert = insert_bools[i] labels = sort(unique(training[!, key])) global training = onehot_df(training, key, labels, insert) global scoring = onehot_df(scoring, key, labels, insert) end # drop useless select!(training, Not("index")) select!(scoring, Not("index")) training = Float32.(training) scoring = Float32.(scoring) #write to files Xscoring = Float32.(scoring) CSV.write("x"*filename_scoring, Xscoring) Xtraining = Float32.(select(training, Not("target"))) CSV.write("x"*filename_training, Xtraining) Ytraining = Float32.(training[!, "target"]) CSV.write("y"*filename_training, DataFrame(target=Ytraining)); end # ╔═╡ 9eb19f36-6c71-11eb-32da-394849a133bd training # ╔═╡ a231b4a2-6c71-11eb-306b-cf574cf3d093 scoring # ╔═╡ 97a6a438-6c72-11eb-3996-0f96b5765cfe describe(training) # ╔═╡ 9da64908-6c72-11eb-256e-9f683ffde6bc describe(scoring) # ╔═╡ Cell order: # ╠═21895968-6c5d-11eb-1800-cd0f921c6d83 # ╠═326f6ec0-6c5d-11eb-105a-ff2f251973c2 # ╠═3771c65c-6c5d-11eb-24d7-39b38b5bfde6 # ╠═3b016552-6c5d-11eb-0818-096189d166bb # ╠═49b50c02-6c5d-11eb-2c20-15a0ffa11887 # ╠═4d686d12-6c5d-11eb-0f48-f9ad13682cd2 # ╠═9eb19f36-6c71-11eb-32da-394849a133bd # ╠═a231b4a2-6c71-11eb-306b-cf574cf3d093 # ╠═97a6a438-6c72-11eb-3996-0f96b5765cfe # ╠═9da64908-6c72-11eb-256e-9f683ffde6bc