# ---- Inputs ----
# Project table with fixed geo names, in wide form and in long-by-geography form.
dt <- read_parquet(
  here::here("data-processed", "misto_fix-02-gnames.parquet")
)
dtl <- read_parquet(
  here::here("data-processed", "misto_fix-02-gnames_long-geo.parquet")
)

# Project metadata and geo-structure tables (per project / per level).
prj_meta <- read_parquet(
  here::here("data-processed", "prj-esif-meta.parquet")
)
dt_geostructure_byprj <- read_parquet(
  here::here("data-processed", "dt_geostructure_by-prj.parquet")
)
dt_geostructure_bylvl <- read_parquet(
  here::here("data-processed", "dt_geostructure_by-lvl.parquet")
)

# Per-project geo status classification (column `geostatus`).
dt_geostatus <- read_parquet(
  here::here("data-processed", "projects-geo-check-groups.parquet")
)

# IDs of projects whose geo status is the non-hierarchical multi-place group.
prj_id_nonhier <- pull(
  filter(dt_geostatus, geostatus == "více míst nehierarchicky"),
  prj_id
)

# Project-local scripts; presumably they define helpers and lookup objects
# used further down (e.g. `add_long_geoid`, `ids`) -- TODO confirm.
source(here::here("read_metadata.R"))
source(here::here("shared.R"))
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# One row per distinct combination of programme ID and the `prj_*` / `p_*`
# columns; `prj_radek` is excluded before de-duplication.
dts <- distinct(
  select(dt, op_id, starts_with("prj_"), starts_with("p_"), -prj_radek)
)

# Additional processed inputs.
orgs <- read_parquet(
  here::here("data-processed", "orgs_sp.parquet")
)
prj_no_obec <- read_parquet(
  here::here("data-processed", "prj_id_noobec.parquet")
)

# Frequency of each geo status group.
table(dt_geostatus[["geostatus"]])
##
## bez místa více míst nehierarchicky více míst na stejné úrovni
## 27 119 5586
## více míst hierarchicky obec a ZUJ se kryjí jedno místo
## 9344 125 71471
# Build `dtl_smpl`: geography rows that can be used as given in the input data.
#
# Steps:
#   1. keep `typ == "id"` rows whose `level` sorts below "orp";
#   2. attach the per-project geo status (natural join on shared columns);
#   3. drop projects in the non-hierarchical multi-place group, and drop the
#      "zuj" rows of projects where obec and ZUJ coincide;
#   4. add provenance columns, keeping the original ID/level next to the
#      shortened ID (characters 6-11 of the original value);
#   5. number rows into chunks within `op_id` and number rows within `prj_id`;
#   6. group by (obec_puvod, op_id, chunk, rozpad_typ), fix the column order
#      and pass the result to `add_long_geoid()` (defined outside this chunk).
dtl_smpl <- dtl %>%
  filter(typ == "id", level < "orp") %>%
  left_join(dt_geostatus) %>%
  filter(!(geostatus %in% "více míst nehierarchicky")) %>%
  filter(!(geostatus == "obec a ZUJ se kryjí" & level == "zuj")) %>%
  select(prj_id, op_id, value, level, rozpad_typ = geostatus) %>%
  mutate(
    obec_puvod = "obec nebo ZUJ u projektů, kde byly ve vstupních datech",
    rozpad_typ = as.character(rozpad_typ),
    rozpad_duvod = NA_character_,
    geo_id_orig = value,
    level_orig = as.character(level),
    geo_id = str_sub(value, 6, 11)
  ) %>%
  # chunk index within each programme: a new chunk every 3e6 rows
  group_by(op_id) %>%
  mutate(chunk = as.integer(floor(row_number() / 3e6) + 1)) %>%
  # running row number within each project
  group_by(prj_id) %>%
  mutate(radek = row_number()) %>%
  group_by(obec_puvod, op_id, chunk, rozpad_typ) %>%
  select(
    prj_id, radek, op_id, level,
    geo_id, level_orig, geo_id_orig,
    rozpad_typ, rozpad_duvod, obec_puvod, chunk
  ) %>%
  add_long_geoid(ids)
## Joining, by = c("op_id", "prj_id")
## Joining, by = c("level", "geo_id")
# Number of distinct projects in the resolved table.
n_distinct(dtl_smpl[["prj_id"]])
## [1] 49464
# Number of distinct (project, rozpad_typ) pairs per rozpad_typ.
count(distinct(dtl_smpl, prj_id, rozpad_typ), rozpad_typ)

# Summary of the resolved table with the per-level geo-structure columns
# attached; grouping is dropped so the summary covers the table as a whole.
dtl_smpl %>%
  ungroup() %>%
  left_join(dt_geostructure_bylvl) %>%
  skimr::skim()
## Joining, by = c("prj_id", "op_id", "level")
Data summary
Name |
Piped data |
Number of rows |
65769 |
Number of columns |
15 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
character |
9 |
logical |
2 |
numeric |
3 |
________________________ |
|
Group variables |
None |
Variable type: factor
level |
0 |
1 |
TRUE |
2 |
obe: 58963, zuj: 6806, orp: 0, okr: 0 |
Variable type: character
prj_id |
0 |
1.00 |
11 |
34 |
0 |
49464 |
0 |
op_id |
0 |
1.00 |
4 |
6 |
0 |
7 |
0 |
geo_id |
0 |
1.00 |
6 |
6 |
0 |
5732 |
0 |
level_orig |
0 |
1.00 |
3 |
4 |
0 |
2 |
0 |
geo_id_orig |
0 |
1.00 |
11 |
11 |
0 |
5732 |
0 |
rozpad_typ |
0 |
1.00 |
11 |
26 |
0 |
4 |
0 |
rozpad_duvod |
65769 |
0.00 |
NA |
NA |
0 |
0 |
0 |
obec_puvod |
0 |
1.00 |
54 |
54 |
0 |
1 |
0 |
geo_id_long |
4448 |
0.93 |
11 |
11 |
0 |
5710 |
0 |
Variable type: logical
min_level |
0 |
1 |
0.93 |
TRU: 61388, FAL: 4381 |
multiunit |
0 |
1 |
0.23 |
FAL: 50585, TRU: 15184 |
Variable type: numeric
radek |
0 |
1 |
4.27 |
15.28 |
1 |
1 |
1 |
1 |
251 |
▇▁▁▁▁ |
chunk |
0 |
1 |
1.00 |
0.00 |
1 |
1 |
1 |
1 |
1 |
▁▁▇▁▁ |
n_units |
0 |
1 |
7.27 |
25.89 |
1 |
1 |
1 |
1 |
251 |
▇▁▁▁▁ |
# Make sure the output directory exists (no warning if it already does),
# then write the resolved table into it.
# TRUE/FALSE are spelled out: `T` and `F` are ordinary variables that can be
# reassigned, which would silently change these arguments.
dir.create(
  here::here("data-processed", "dtl_resolved_smpl"),
  recursive = TRUE,
  showWarnings = FALSE
)
write_parquet(
  dtl_smpl,
  here::here("data-processed", "dtl_resolved_smpl", "all.parquet")
)