CREATE TABLE titanik_train_3 ( id bigint, survived integer, pclass integer, name character varying(255), sex character varying(255), age double precision, sibsp integer, parch integer, ticket character varying(255), fare double precision, cabin character varying(255), embarked character varying(255), cabin_cnt integer, cabin_type text, ticket_type text, ticket_number text, cabin_people_cnt integer )
select a.* into titanik_full_1 from ( select * from titanik_train_3 union select id, NULL::integer as survived, pclass, "name", sex , age , sibsp , parch, ticket,fare,cabin,embarked,cabin_cnt,cabin_type,ticket_type,ticket_number, cabin_people_cnt from titanik_test_3 ) as a;
select a.* into titanik_full_2 from ( select id, survived, pclass::float, age::float, sibsp::float, parch::float, fare::float, cabin_cnt::float, CAST(ticket_number as float) as ticket_number, cabin_people_cnt::float from titanik_full_1 where ticket_number != '' union select id, survived, pclass, age, sibsp, parch, fare, cabin_cnt, 0 as ticket_number, cabin_people_cnt from titanik_full_1 where ticket_number = '' ) as a;
CREATE TABLE titanik_full_2 ( id bigint, survived integer, pclass integer, age double precision, sibsp integer, parch integer, fare double precision, cabin_cnt integer, ticket_number integer, cabin_people_cnt bigint )
CREATE OR REPLACE FUNCTION sparse_matrix_generator( tablename_source character varying, tablename_dest character varying, field_name character varying) RETURNS integer AS $$ DECLARE pgst_object REFCURSOR; unival character varying; BEGIN OPEN pgst_object FOR EXECUTE 'select distinct '||field_name ||' from '||tablename_source ||' where ' || field_name ||' NOTNULL'; LOOP FETCH pgst_object INTO unival; EXIT WHEN NOT FOUND; EXECUTE 'ALTER TABLE '|| tablename_dest ||' ADD COLUMN "'|| field_name||unival ||'" smallint NOT NULL DEFAULT 0'; EXECUTE 'UPDATE '||tablename_dest||' SET "'||field_name||unival|| '"= 1 FROM ' ||tablename_source|| ' WHERE '||tablename_dest||'.id = '||tablename_source||'.id and '||field_name||' = '''||unival||''''; END LOOP; RETURN 0; END; $$ LANGUAGE 'plpgsql';
select sparse_matrix_generator('titanik_full_1', 'titanik_full_2', 'cabin_type'); select sparse_matrix_generator('titanik_full_1', 'titanik_full_2', 'ticket_type'); select sparse_matrix_generator('titanik_full_1', 'titanik_full_2', 'embarked'); select sparse_matrix_generator('titanik_full_1', 'titanik_full_2', 'sex');
CREATE OR REPLACE FUNCTION minmax_normalizer(tablename_source character varying, field_name character varying) RETURNS integer AS $BODY$ DECLARE pgst_object REFCURSOR; maxval float; minval float; C RECORD; BEGIN EXECUTE 'select min("'||field_name ||'") as minval, max("'||field_name ||'") as maxval from '|| tablename_source INTO C; maxval := C.maxval; minval := C.minval; EXECUTE 'UPDATE '||tablename_source||' SET "'||field_name||'"=("'||field_name||'"-$1)/($2-$1)' USING minval, maxval; RETURN 0; END; $BODY$ LANGUAGE plpgsql VOLATILE COST 100;
select minmax_normalizer('titanik_full_2', 'pclass'); select minmax_normalizer('titanik_full_2', 'age'); select minmax_normalizer('titanik_full_2', 'sibsp'); select minmax_normalizer('titanik_full_2', 'parch'); select minmax_normalizer('titanik_full_2', 'fare'); select minmax_normalizer('titanik_full_2', 'cabin_cnt'); select minmax_normalizer('titanik_full_2', 'ticket_number'); select minmax_normalizer('titanik_full_2', 'cabin_people_cnt');
select * into titanik_test_final from titanik_full_2 where survived isnull; alter table titanik_test_final drop column survived;
select * into titanik_train_final from titanik_full_2 where survived notnull;
CREATE OR REPLACE FUNCTION null_normalizer(tablename_source character varying) RETURNS integer AS $BODY$ DECLARE pgst_object REFCURSOR; fieldval character varying; count_null integer; field_avg float; BEGIN OPEN pgst_object FOR EXECUTE 'select column_name from information_schema.columns where'|| ' table_name='''||tablename_source||''''; LOOP FETCH pgst_object INTO fieldval; EXIT WHEN NOT FOUND; count_null := 0; EXECUTE 'select count(id) from '||tablename_source||' where "'||fieldval||'" isnull' into count_null; IF count_null > 0 THEN raise notice 'field: %', fieldval; EXECUTE 'select avg('||fieldval||') from '||tablename_source INTO field_avg; EXECUTE 'UPDATE '||tablename_source||' set '||fieldval||'= $1 where '||fieldval||' isnull' using field_avg; END IF; END LOOP; RETURN 0; END; $BODY$ LANGUAGE plpgsql VOLATILE COST 100;
select null_normalizer('titanik_test_final'); select null_normalizer('titanik_train_final');
CREATE TABLE titanik_test_final ( id bigint, pclass double precision, age double precision, sibsp double precision, parch double precision, fare double precision, cabin_cnt double precision, ticket_number double precision, cabin_people_cnt double precision, "cabin_typeF" smallint, "cabin_typeB" smallint, "cabin_typeG" smallint, "cabin_typeC" smallint, "cabin_typeT" smallint, "cabin_typeD" smallint, "cabin_typeE" smallint, "cabin_typeA" smallint, "ticket_typeSW/PP" smallint, "ticket_typeC" smallint, "ticket_typePC" smallint, "ticket_typeAQ/3." smallint, "ticket_typeSC/A.3" smallint, "ticket_typeS.OC" smallint, "ticket_typeS.O./PP" smallint, "ticket_typeSC/AH" smallint, "ticket_typeSOTON/O2" smallint, "ticket_typeC.A." smallint, "ticket_typeW/C" smallint, "ticket_typeS.C./A.4." smallint, "ticket_typeFa" smallint, "ticket_typeLP" smallint, "ticket_typeSCO/W" smallint, "ticket_typeF.C." smallint, "ticket_typeA.5." smallint, "ticket_typeSC/AH Basle" smallint, "ticket_typeSC/A4" smallint, "ticket_typeS.C./PARIS" smallint, "ticket_typeS.OP" smallint, "ticket_typeLINE" smallint, "ticket_typeSO/C" smallint, "ticket_typeP/PP" smallint, "ticket_typeAQ/4" smallint, "ticket_typeSC" smallint, "ticket_typeW.EP" smallint, "ticket_typeSOTON/OQ" smallint, "ticket_typeA/4" smallint, "ticket_typeSC/PARIS" smallint, "ticket_typeA. 2." smallint, "ticket_typeF.CC" smallint, "ticket_typeS.P." smallint, "ticket_typePP" smallint, "ticket_typeC.A./SOTON" smallint, "embarkedC" smallint, "embarkedQ" smallint, "embarkedS" smallint, sexfemale smallint, sexmale smallint )
COPY titanik_train_final to '/tmp/titanik_train_final.csv'; COPY titanik_test_final to '/tmp/titanik_test_final.csv';
Source: https://habr.com/ru/post/165283/
All Articles