You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

121 lines
4.0 KiB

### A Pluto.jl notebook ###
# v0.12.20
using Markdown
using InteractiveUtils
# ╔═╡ 21895968-6c5d-11eb-1800-cd0f921c6d83
begin
using DataFrames, CSV
using Dates
using Flux
end
# ╔═╡ 326f6ec0-6c5d-11eb-105a-ff2f251973c2
splitLabel(label) = split(label, " ")[1]
# ╔═╡ 3771c65c-6c5d-11eb-24d7-39b38b5bfde6
function onehot_df(training::DataFrame, key::String, labels::Array, insert::Bool)
if !insert
return select(training, Not(key))
end
oh_key = Flux.onehotbatch(training[!, key], labels)
return hcat(select(training, Not(key)), DataFrame(oh_key', (key[1:4]*"_") .* splitLabel.(labels)))
end
# ╔═╡ 3b016552-6c5d-11eb-0818-096189d166bb
dateConv(d) = Dates.DateTime(d, "yyyy-mm-dd HH:MM:SS")
# ╔═╡ 49b50c02-6c5d-11eb-2c20-15a0ffa11887
convertDay(array) = Dates.value.(convert.(Dates.Day, round.(array, Dates.Day)))
# ╔═╡ 4d686d12-6c5d-11eb-0f48-f9ad13682cd2
begin
filename_training = "training_SC_GGP_AXA_FR.csv"
filename_scoring = "scoring_SC_GGP_AXA_FR.csv"
training = DataFrame(CSV.File(filename_training))
scoring = DataFrame(CSV.File(filename_scoring))
# Date conversion
# Missing
today_ = Dates.now()
today_string = Dates.format(today_, "yyyy-mm-dd HH:MM:SS")
replace!(training.ouverture_dernier_sinistre, missing => today_string)
replace!(scoring.ouverture_dernier_sinistre, missing => today_string)
replace!(training.cloture_dernier_sinistre, missing => today_string)
replace!(scoring.cloture_dernier_sinistre, missing => today_string)
# Date conversion
date_keys = ["debut_contrat", "ouverture_dernier_sinistre", "cloture_dernier_sinistre"]
for key in date_keys
training[!, key] = dateConv.(training[!, key])
scoring[!, key] = dateConv.(scoring[!, key])
end
# Add duree
insertcols!(training, "duree_contrat" => convertDay(today_ - training.debut_contrat))
insertcols!(scoring, "duree_contrat" => convertDay(today_ - scoring.debut_contrat))
insertcols!(training, "duree_dernier_sinistre" => convertDay(training.cloture_dernier_sinistre - training.ouverture_dernier_sinistre))
insertcols!(scoring, "duree_dernier_sinistre" => convertDay(scoring.cloture_dernier_sinistre - scoring.ouverture_dernier_sinistre))
insertcols!(training, "duree_zero_sinistre" => convertDay(today_ - training.cloture_dernier_sinistre))
insertcols!(scoring, "duree_zero_sinistre" => convertDay(today_ - scoring.cloture_dernier_sinistre))
# Remove dates
for key in date_keys
select!(training, Not(key))
select!(scoring, Not(key))
end
# One hot encode categorical
categorical_keys = ["departement", "categorie_socio_professionnelle", "type_de_bien", "statut"]
insert_bools = [true, true, true, true]
for i in 1:4
key = categorical_keys[i]
insert = insert_bools[i]
labels = sort(unique(training[!, key]))
global training = onehot_df(training, key, labels, insert)
global scoring = onehot_df(scoring, key, labels, insert)
end
# drop useless
select!(training, Not("index"))
select!(scoring, Not("index"))
training = Float32.(training)
scoring = Float32.(scoring)
#write to files
Xscoring = Float32.(scoring)
CSV.write("x"*filename_scoring, Xscoring)
Xtraining = Float32.(select(training, Not("target")))
CSV.write("x"*filename_training, Xtraining)
Ytraining = Float32.(training[!, "target"])
CSV.write("y"*filename_training, DataFrame(target=Ytraining));
end
# ╔═╡ 9eb19f36-6c71-11eb-32da-394849a133bd
training
# ╔═╡ a231b4a2-6c71-11eb-306b-cf574cf3d093
scoring
# ╔═╡ 97a6a438-6c72-11eb-3996-0f96b5765cfe
describe(training)
# ╔═╡ 9da64908-6c72-11eb-256e-9f683ffde6bc
describe(scoring)
# ╔═╡ Cell order:
# ╠═21895968-6c5d-11eb-1800-cd0f921c6d83
# ╠═326f6ec0-6c5d-11eb-105a-ff2f251973c2
# ╠═3771c65c-6c5d-11eb-24d7-39b38b5bfde6
# ╠═3b016552-6c5d-11eb-0818-096189d166bb
# ╠═49b50c02-6c5d-11eb-2c20-15a0ffa11887
# ╠═4d686d12-6c5d-11eb-0f48-f9ad13682cd2
# ╠═9eb19f36-6c71-11eb-32da-394849a133bd
# ╠═a231b4a2-6c71-11eb-306b-cf574cf3d093
# ╠═97a6a438-6c72-11eb-3996-0f96b5765cfe
# ╠═9da64908-6c72-11eb-256e-9f683ffde6bc