# Inputs ----
# Load the processed parquet inputs from the project's "data-processed" folder.
dt <- read_parquet(
  here::here("data-processed", "misto_fix-02-gnames.parquet")
)
dtl <- read_parquet(
  here::here("data-processed", "misto_fix-02-gnames_long-geo.parquet")
)
prj_meta <- read_parquet(
  here::here("data-processed", "prj-esif-meta.parquet")
)
dt_geostructure_byprj <- read_parquet(
  here::here("data-processed", "dt_geostructure_by-prj.parquet")
)
dt_geostructure_bylvl <- read_parquet(
  here::here("data-processed", "dt_geostructure_by-lvl.parquet")
)
dt_geostatus <- read_parquet(
  here::here("data-processed", "projects-geo-check-groups.parquet")
)

# Project IDs whose geostatus is "více míst nehierarchicky"
# (multiple places, non-hierarchical).
prj_id_nonhier <- pull(
  filter(dt_geostatus, geostatus == "více míst nehierarchicky"),
  prj_id
)

# Run the project's helper scripts.
# NOTE(review): presumably these define helpers used further down
# (e.g. add_long_geoid(), `ids`) -- confirm against the scripts themselves.
source(here::here("read_metadata.R"))
source(here::here("shared.R"))
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Distinct combinations of `op_id` with every `prj_*` and `p_*` column of `dt`,
# leaving out `prj_radek`.
dts <- distinct(
  select(dt, op_id, starts_with("prj_"), starts_with("p_"), -prj_radek)
)

# Two further processed inputs.
orgs <- read_parquet(
  here::here("data-processed", "orgs_sp.parquet")
)
prj_no_obec <- read_parquet(
  here::here("data-processed", "prj_id_noobec.parquet")
)
# Frequency of each geostatus value across the rows of `dt_geostatus`.
table(dt_geostatus[["geostatus"]])
##
## bez místa více míst nehierarchicky více míst na stejné úrovni
## 27 119 5586
## více míst hierarchicky obec a ZUJ se kryjí jedno místo
## 9344 125 71471
# Build `dtl_smpl` from the ID rows of `dtl` at levels below "orp", for every
# project except those classed as "více míst nehierarchicky" (multiple places,
# non-hierarchical).
#
# The result deliberately keeps its last grouping
# (obec_puvod, op_id, chunk, rozpad_typ): that grouped table is what gets
# passed to add_long_geoid() and assigned to `dtl_smpl`.
dtl_smpl <- dtl %>%
  # ID rows only. `level` is expected to be an ordered factor, so `<` compares
  # by level order rather than alphabetically.
  filter(typ == "id", level < "orp") %>%
  # Join keys are spelled out so the join does not depend on whichever columns
  # the two tables happen to share (and prints no "Joining, by" message).
  left_join(dt_geostatus, by = c("op_id", "prj_id")) %>%
  filter(!geostatus %in% c("více míst nehierarchicky")) %>%
  # For projects whose obec and ZUJ coincide, drop the ZUJ-level row.
  filter(!(geostatus == "obec a ZUJ se kryjí" & level == "zuj")) %>%
  select(prj_id, op_id, value, level, rozpad_typ = geostatus) %>%
  # Purely row-wise columns, so no grouping is needed for this step.
  mutate(
    # Literal label: "obec or ZUJ for projects where they were in the input
    # data".
    obec_puvod = "obec nebo ZUJ u projektů, kde byly ve vstupních datech",
    rozpad_typ = as.character(rozpad_typ),
    rozpad_duvod = NA_character_,
    # Keep the full ID before `value` is shortened below (order matters).
    id_orig = value,
    level_orig = as.character(level),
    # Characters 6 to 11 of the original ID.
    value = str_sub(value, 6, 11)
  ) %>%
  # Chunk number within each op_id. With this formula chunk 1 holds rows
  # 1..2,999,999 and row 3,000,000 is the first row of chunk 2.
  group_by(op_id) %>%
  mutate(chunk = as.integer(floor(row_number() / 3e6) + 1)) %>%
  # Running row number within each project.
  group_by(prj_id) %>%
  mutate(radek = row_number()) %>%
  group_by(obec_puvod, op_id, chunk, rozpad_typ) %>%
  select(
    prj_id, radek, op_id, level,
    geo_id = value, level_orig, geo_id_orig = id_orig,
    rozpad_typ, rozpad_duvod, obec_puvod, chunk
  ) %>%
  # NOTE(review): add_long_geoid() and `ids` are defined outside this chunk;
  # presumably this adds `geo_id_long` by joining on level and geo_id -- confirm.
  add_long_geoid(ids)
## Joining, by = c("op_id", "prj_id")
## Joining, by = c("level", "geo_id")
# Number of distinct project IDs in `dtl_smpl`.
length(unique(dtl_smpl[["prj_id"]]))
## [1] 49464
# Count distinct projects per `rozpad_typ`. `dtl_smpl` is not ungrouped first,
# so any grouping it still carries applies to both distinct() and count().
count(
  distinct(dtl_smpl, prj_id, rozpad_typ),
  rozpad_typ
)
obec_puvod <chr> | op_id <chr> | chunk <int> | |
---|---|---|---|
obec nebo ZUJ u projektů, kde byly ve vstupních datech | IROP | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | IROP | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | IROP | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | IROP | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | OP D | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | OP D | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | OP D | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | OP D | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | OP PIK | 1 | |
obec nebo ZUJ u projektů, kde byly ve vstupních datech | OP PIK | 1 | |
# Diagnostic summary of `dtl_smpl` with the per-level geographic structure
# columns attached. Join keys are explicit so the join does not depend on
# whichever columns the two tables happen to share.
dtl_smpl %>%
  left_join(dt_geostructure_bylvl, by = c("prj_id", "op_id", "level")) %>%
  # Drop the grouping so skim() summarises the table as a whole.
  ungroup() %>%
  skimr::skim()
## Joining, by = c("prj_id", "op_id", "level")
Name | Piped data |
Number of rows | 65769 |
Number of columns | 15 |
_______________________ | |
Column type frequency: | |
factor | 1 |
character | 9 |
logical | 2 |
numeric | 3 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
level | 0 | 1 | TRUE | 2 | obe: 58963, zuj: 6806, orp: 0, okr: 0 |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
prj_id | 0 | 1.00 | 11 | 34 | 0 | 49464 | 0 |
op_id | 0 | 1.00 | 4 | 6 | 0 | 7 | 0 |
geo_id | 0 | 1.00 | 6 | 6 | 0 | 5732 | 0 |
level_orig | 0 | 1.00 | 3 | 4 | 0 | 2 | 0 |
geo_id_orig | 0 | 1.00 | 11 | 11 | 0 | 5732 | 0 |
rozpad_typ | 0 | 1.00 | 11 | 26 | 0 | 4 | 0 |
rozpad_duvod | 65769 | 0.00 | NA | NA | 0 | 0 | 0 |
obec_puvod | 0 | 1.00 | 54 | 54 | 0 | 1 | 0 |
geo_id_long | 4448 | 0.93 | 11 | 11 | 0 | 5710 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
min_level | 0 | 1 | 0.93 | TRU: 61388, FAL: 4381 |
multiunit | 0 | 1 | 0.23 | FAL: 50585, TRU: 15184 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
radek | 0 | 1 | 4.27 | 15.28 | 1 | 1 | 1 | 1 | 251 | ▇▁▁▁▁ |
chunk | 0 | 1 | 1.00 | 0.00 | 1 | 1 | 1 | 1 | 1 | ▁▁▇▁▁ |
n_units | 0 | 1 | 7.27 | 25.89 | 1 | 1 | 1 | 1 | 251 | ▇▁▁▁▁ |
# Create the output directory if needed (no warning when it already exists),
# then write `dtl_smpl` to a single parquet file inside it.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
dir.create(
  here::here("data-processed", "dtl_resolved_smpl"),
  recursive = TRUE,
  showWarnings = FALSE
)
write_parquet(
  dtl_smpl,
  here::here("data-processed", "dtl_resolved_smpl/all.parquet")
)