dt <- read_parquet(here::here("data-processed", "misto_renamed-n.parquet"))
dtl <- read_parquet(here::here("data-processed", "misto_long-geo.parquet"))
setdiff(dt$prj_id, dtl$prj_id)
## [1] "CZ.03.2.65/0.0/0.0/16_047/0009355" "CZ.05.4.27/0.0/0.0/17_088/0008643"
## [3] "CZ.05.4.27/0.0/0.0/17_088/0010521" "CZ.05.4.27/0.0/0.0/15_009/0001444"
## [5] "CZ.05.4.27/0.0/0.0/15_009/0001448" "CZ.05.4.27/0.0/0.0/15_010/0000089"
## [7] "CZ.05.4.27/0.0/0.0/17_058/0006131" "CZ.05.4.27/0.0/0.0/17_078/0005408"
## [9] "CZ.05.4.27/0.0/0.0/17_078/0008178" "CZ.05.4.27/0.0/0.0/17_078/0008605"
## [11] "CZ.05.4.27/0.0/0.0/17_078/0009044" "CZ.05.4.27/0.0/0.0/17_078/0009118"
## [13] "CZ.05.4.27/0.0/0.0/17_110/0010993" "CZ.05.4.27/0.0/0.0/18_091/0008066"
## [15] "CZ.05.4.27/0.0/0.0/18_106/0008947" "CZ.05.3.29/0.0/0.0/19_118/0010220"
## [17] "CZ.05.4.27/0.0/0.0/19_120/0010325" "CZ.05.4.27/0.0/0.0/19_120/0010358"
## [19] "CZ.05.4.27/0.0/0.0/19_120/0011029" "CZ.05.4.27/0.0/0.0/19_120/0011414"
## [21] "CZ.05.4.27/0.0/0.0/19_129/0010807" "CZ.05.4.27/0.0/0.0/19_131/0010038"
## [23] "CZ.05.4.27/0.0/0.0/19_131/0010314" "CZ.05.4.27/0.0/0.0/19_131/0010736"
## [25] "CZ.05.4.27/0.0/0.0/20_140/0012670" "CZ.05.4.27/0.0/0.0/20_140/0012671"
## [27] "CZ.05.4.27/0.0/0.0/20_140/0012672"
obec_id_e <- unique(dt$g_obec_id)
kraj_id_e <- unique(dt$g_kraj_id)
orp_id_e <- unique(dt$g_orp_id)
okres_id_e <- unique(dt$g_okres_id)
zuj_id_e <- unique(dt$g_zuj_id)
head(obec_id_e)
## [1] NA "CZ020539139" "CZ041555380" "CZ041538116" "CZ052569810" "CZ020539902"
head(zuj_id_e)
## [1] NA "CZ020564982" "CZ080555088" "CZ080598135" "CZ080599565" "CZ080506753"
head(orp_id_e)
## [1] NA "2122" "2105" "4103" "2103" "5309"
head(okres_id_e)
## [1] "CZ0209" NA "CZ020A" "CZ0412" "CZ0532" "CZ0311"
head(kraj_id_e)
## [1] NA "CZ020" "CZ041" "CZ053" "CZ080" "CZ064"
Děje se v 03_load-metadata.R
, aby se stahování a počítání dělalo jen jednou.
source("read_metadata.R")
obce_n <- obecnuts %>%
mutate(kod_n = paste0(CZNUTS, CHODNOTA2))
zuj_n <- zujnuts %>%
mutate(kod_n = paste0(CZNUTS, CHODNOTA2))
head(obce_n$kod_n)
## [1] "CZ010554782" "CZ020503410" "CZ020505781" "CZ020512991" "CZ020513032" "CZ020513041"
head(zuj_n$kod_n)
## [1] "CZ010500054" "CZ010500089" "CZ010500097" "CZ010500119" "CZ010500143" "CZ010500178"
dt_validated <- dt %>%
mutate(valid_kraj = g_kraj_id %in% csu_kraj$CZNUTS,
valid_obec = g_obec_id %in% obce_n$kod_n,
valid_orp = g_orp_id %in% csu_orp$CHODNOTA,
valid_okres = g_okres_id %in% csu_okres$CHODNOTA,
valid_zuj = g_zuj_id %in% zuj_n$kod_n) %>%
mutate(valid_kraj = if_else(is.na(g_kraj_id), NA, valid_kraj),
valid_obec = if_else(is.na(g_obec_id), NA, valid_obec),
valid_orp = if_else(is.na(g_orp_id), NA, valid_orp),
valid_okres = if_else(is.na(g_okres_id), NA, valid_okres),
valid_zuj = if_else(is.na(g_zuj_id), NA, valid_zuj)) %>%
select(matches(c("valid", "g_.*_id")))
skimr::skim(dt_validated)
Name | dt_validated |
Number of rows | 149584 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 5 |
logical | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
g_zuj_id | 142550 | 0.05 | 6 | 11 | 0 | 2071 | 0 |
g_obec_id | 90164 | 0.40 | 11 | 11 | 0 | 5614 | 0 |
g_orp_id | 141679 | 0.05 | 4 | 4 | 0 | 205 | 0 |
g_okres_id | 141539 | 0.05 | 6 | 6 | 0 | 77 | 0 |
g_kraj_id | 83075 | 0.44 | 5 | 5 | 0 | 15 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
valid_kraj | 83075 | 0.44 | 1.00 | TRU: 66508, FAL: 1 |
valid_obec | 90164 | 0.40 | 1.00 | TRU: 59418, FAL: 2 |
valid_orp | 141679 | 0.05 | 1.00 | TRU: 7905 |
valid_okres | 141539 | 0.05 | 1.00 | TRU: 8045 |
valid_zuj | 142550 | 0.05 | 0.75 | TRU: 5307, FAL: 1727 |
Něco se děje s kódy ZÚJ: část nesedí s číselníkem - a část je taky kratší.
dt %>%
mutate(zuj_len = str_length(g_zuj_id)) %>%
count(op_id, zuj_len) %>%
spread(zuj_len, n)
dt %>%
filter(str_length(g_zuj_id) < 11) %>%
distinct(op_id, g_zuj_id)
Což jsou podle všeho městské části Prahy (je jich 57).
Oprava kódů ZÚJ: ke kratším na začátek přilepíme kód kraje
dt_zujfix <- dt %>%
mutate(g_zuj_shortid = str_sub(g_zuj_id, -6, -1)) %>%
left_join(zujnuts %>%
select(g_zuj_shortid = CHODNOTA2, CZNUTS)) %>%
mutate(g_zuj_id_new = if_else(str_length(g_zuj_id) < 11,
paste0(CZNUTS, g_zuj_shortid), g_zuj_id),
g_zuj_id = g_zuj_id_new) %>%
select(-g_zuj_id_new, -CZNUTS)
## Joining, by = "g_zuj_shortid"
dt_zujfix %>%
mutate(ln = str_length(g_zuj_id)) %>%
count(ln)
Kontrola znova:
dt_zujfix_validated <- dt_zujfix %>%
mutate(valid_kraj = g_kraj_id %in% csu_kraj$CZNUTS,
valid_obec = g_obec_id %in% obce_n$kod_n,
valid_orp = g_orp_id %in% csu_orp$CHODNOTA,
valid_okres = g_okres_id %in% csu_okres$CHODNOTA,
valid_zuj = g_zuj_id %in% zuj_n$kod_n) %>%
mutate(valid_kraj = if_else(is.na(g_kraj_id), NA, valid_kraj),
valid_obec = if_else(is.na(g_obec_id), NA, valid_obec),
valid_orp = if_else(is.na(g_orp_id), NA, valid_orp),
valid_okres = if_else(is.na(g_okres_id), NA, valid_okres),
valid_zuj = if_else(is.na(g_zuj_id), NA, valid_zuj)) %>%
select(matches(c("valid", "g_.*_id")))
skimr::skim(dt_zujfix_validated)
Name | dt_zujfix_validated |
Number of rows | 149584 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 5 |
logical | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
g_zuj_id | 142550 | 0.05 | 11 | 11 | 0 | 2071 | 0 |
g_obec_id | 90164 | 0.40 | 11 | 11 | 0 | 5614 | 0 |
g_orp_id | 141679 | 0.05 | 4 | 4 | 0 | 205 | 0 |
g_okres_id | 141539 | 0.05 | 6 | 6 | 0 | 77 | 0 |
g_kraj_id | 83075 | 0.44 | 5 | 5 | 0 | 15 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
valid_kraj | 83075 | 0.44 | 1 | TRU: 66508, FAL: 1 |
valid_obec | 90164 | 0.40 | 1 | TRU: 59418, FAL: 2 |
valid_orp | 141679 | 0.05 | 1 | TRU: 7905 |
valid_okres | 141539 | 0.05 | 1 | TRU: 8045 |
valid_zuj | 142550 | 0.05 | 1 | TRU: 7034 |
Fajn: teď už to sedí: podíly chybějících jsou stejné v původních proměnních jako ve validačních, plus máme velmi málo chybných kódů - jen 2 kraje nesedí.
Které?
kraj_id_e[!kraj_id_e %in% csu_kraj$CZNUTS]
## [1] NA "PL225"
Polský region Bielski a NA
.
dt %>%
filter(prj_id == dt %>%
filter(g_kraj_id == "PL225") %>%
pull(prj_id))
Hmm, jeden projekt OP Z, který se odehrával jen v Polsku… Ale je to výzva na mezinárodní mobilitu, takže asi OK - prostě se jen vyřadí.
write_parquet(dt_zujfix, here::here("data-processed", "misto_fix-01-zuj.parquet"))
dt_fixed_long <- make_long_geo(dt_zujfix)
write_parquet(dt_fixed_long %>%
drop_na(value),
here::here("data-processed", "misto_fix-01-zuj_long-geo.parquet"))
obce_nomatch <- obec_id_e[!obec_id_e %in% paste0(obecnuts$CZNUTS, obecnuts$CHODNOTA2)]
obecnuts %>%
mutate(id = paste0(CZNUTS, CHODNOTA2)) %>%
filter(id %in% obce_nomatch[2]) %>%
pull(TEXT2)
## character(0)
dt_val_ids <- dt_zujfix %>%
# select(prj_id, starts_with("g_")) %>%
left_join(zuj_n %>%
select(g_zuj_nazev_csu = TEXT2,
g_zuj_id = kod_n)) %>%
left_join(obce_n %>%
select(g_obec_nazev_csu = TEXT2,
g_obec_id = kod_n)) %>%
left_join(csu_okres %>%
select(g_okres_nazev_csu = TEXT,
g_okres_id = CHODNOTA)) %>%
left_join(csu_orp %>%
select(g_orp_nazev_csu = TEXT,
g_orp_id = CHODNOTA)) %>%
left_join(csu_kraj %>%
select(g_kraj_nazev_csu = TEXT,
g_kraj_id = CZNUTS))
## Joining, by = "g_zuj_id"
## Joining, by = "g_obec_id"
## Joining, by = "g_okres_id"
## Joining, by = "g_orp_id"
## Joining, by = "g_kraj_id"
dt_val_ids_agent <- dt_val_ids %>%
mutate(across(starts_with("g_"), tolower)) %>%
create_agent() %>%
col_vals_equal(g_zuj_nazev,
vars(g_zuj_nazev_csu), na_pass = T) %>%
col_vals_equal(g_obec_nazev,
vars(g_obec_nazev_csu), na_pass = T) %>%
col_vals_equal(g_orp_nazev,
vars(g_orp_nazev_csu), na_pass = T) %>%
col_vals_equal(g_okres_nazev,
vars(g_okres_nazev_csu), na_pass = T) %>%
col_vals_equal(g_kraj_nazev,
vars(g_kraj_nazev_csu), na_pass = T)
dt_val_ids_agent %>% interrogate(extract_failed = F)
##
## ── Interrogation Started - there are ── 5 ─────────────────────────────────────────────────────────────────
## ✓ Step 1: OK.
## ✓ Step 2: OK.
## ✓ Step 3: OK.
## ✓ Step 4: OK.
## ✓ Step 5: OK.
##
## ── Interrogation Completed ────────────────────────────────────────────────────────────────────────────────
Pointblank Validation | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
[2021-01-14|11:57:19]
tibble
|
|||||||||||||
STEP | COLUMNS | VALUES | TBL | EVAL | ⋅ ⋅ ⋅ | PASS | FAIL | W | S | N | EXT | ||
1 | col_vals_equal()
|
→ |
✓ |
150K |
150K 0.99 |
1 0.01 |
— |
— |
— |
— |
|||
2 | col_vals_equal()
|
→ |
✓ |
150K |
150K 0.99 |
20 0.01 |
— |
— |
— |
— |
|||
3 | col_vals_equal()
|
→ |
✓ |
150K |
150K 1.00 |
0 0.00 |
— |
— |
— |
— |
|||
4 | col_vals_equal()
|
→ |
✓ |
150K |
150K 1.00 |
0 0.00 |
— |
— |
— |
— |
|||
5 | col_vals_equal()
|
→ |
✓ |
150K |
150K 1.00 |
0 0.00 |
— |
— |
— |
— |
|||
2021-01-14 11:57:49 CET< 1 s2021-01-14 11:57:50 CET |
dt_val_ids_agent %>%
interrogate() %>%
get_data_extracts(1) %>%
select(g_zuj_id, g_zuj_nazev, g_zuj_nazev_csu)
##
## ── Interrogation Started - there are ── 5 ─────────────────────────────────────────────────────────────────
## ✓ Step 1: OK.
## ✓ Step 2: OK.
## ✓ Step 3: OK.
## ✓ Step 4: OK.
## ✓ Step 5: OK.
##
## ── Interrogation Completed ────────────────────────────────────────────────────────────────────────────────
dt_val_ids_agent %>%
interrogate() %>%
get_data_extracts(2) %>%
select(g_obec_id, g_obec_nazev, g_obec_nazev_csu) %>%
distinct()
##
## ── Interrogation Started - there are ── 5 ─────────────────────────────────────────────────────────────────
## ✓ Step 1: OK.
## ✓ Step 2: OK.
## ✓ Step 3: OK.
## ✓ Step 4: OK.
## ✓ Step 5: OK.
##
## ── Interrogation Completed ────────────────────────────────────────────────────────────────────────────────
dt_val_ids_csunames <- dt_val_ids %>%
select(-g_zuj_nazev, -g_obec_nazev) %>%
rename(g_zuj_nazev = g_zuj_nazev_csu,
g_obec_nazev = g_obec_nazev_csu) %>%
select_at(vars(-ends_with("_csu")))
dt_val_ids_csunames_long <- make_long_geo(dt_val_ids_csunames) %>%
drop_na(value)
write_parquet(dt_val_ids_csunames,
here::here("data-processed",
"misto_fix-02-gnames.parquet"))
write_parquet(dt_val_ids_csunames_long,
here::here("data-processed",
"misto_fix-02-gnames_long-geo.parquet"))