# Main input tables: the wide table and its long-geo variant (per file names).
dt <- read_parquet(
  here::here("data-processed", "misto_fix-02-gnames.parquet")
)
dtl <- read_parquet(
  here::here("data-processed", "misto_fix-02-gnames_long-geo.parquet")
)

# Supporting tables read from data-processed/.
prj_meta <- read_parquet(
  here::here("data-processed", "prj-esif-meta.parquet")
)
dt_geostructure_byprj <- read_parquet(
  here::here("data-processed", "dt_geostructure_by-prj.parquet")
)
dt_geostructure_bylvl <- read_parquet(
  here::here("data-processed", "dt_geostructure_by-lvl.parquet")
)
dt_geostatus <- read_parquet(
  here::here("data-processed", "projects-geo-check-groups.parquet")
)

# prj_id values of projects whose geostatus is "více míst nehierarchicky".
prj_id_nonhier <- pull(
  filter(dt_geostatus, geostatus == "více míst nehierarchicky"),
  prj_id
)

# Run the project scripts; kept after the reads, as in the original order.
source(here::here("read_metadata.R"))
source(here::here("shared.R"))
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
# Distinct combinations of op_id and all prj_* / p_* columns of dt,
# leaving out prj_radek.
dts <- distinct(
  select(dt, op_id, starts_with("prj_"), starts_with("p_"), -prj_radek)
)

# Two more inputs from data-processed/.
orgs <- read_parquet(
  here::here("data-processed", "orgs_sp.parquet")
)
prj_no_obec <- read_parquet(
  here::here("data-processed", "prj_id_noobec.parquet")
)

# Frequency of each geostatus value.
table(dt_geostatus[["geostatus"]])
## 
##                  bez místa   více míst nehierarchicky více míst na stejné úrovni 
##                         27                        119                       5586 
##     více míst hierarchicky        obec a ZUJ se kryjí                jedno místo 
##                       9344                        125                      71471
# Build dtl_smpl from the long geo table.
#
# Keeps the "id" rows of dtl whose level is below "orp" (level is compared as
# an ordered factor), attaches each project's geostatus, and drops
#   * projects with geostatus "více míst nehierarchicky", and
#   * "zuj" rows of projects with geostatus "obec a ZUJ se kryjí".
# The remaining rows get bookkeeping columns (original id and level, the
# rozpad_* and obec_puvod labels, chunk, radek) and are handed to
# add_long_geoid() together with `ids`.
dtl_smpl <- dtl %>%
  filter(typ == "id", level < "orp") %>%
  # Keys spelled out so that a column added to either table later cannot
  # silently become part of the join.
  left_join(dt_geostatus, by = c("op_id", "prj_id")) %>%
  filter(!geostatus %in% "více míst nehierarchicky") %>%
  filter(!(geostatus == "obec a ZUJ se kryjí" & level == "zuj")) %>%
  select(prj_id, op_id, value, level, rozpad_typ = geostatus) %>%
  mutate(
    obec_puvod = "obec nebo ZUJ u projektů, kde byly ve vstupních datech",
    rozpad_typ = as.character(rozpad_typ),
    rozpad_duvod = NA_character_,
    id_orig = value,
    level_orig = as.character(level),
    # Characters 6-11 of the original id become the short geo id.
    value = str_sub(value, 6, 11)
  ) %>%
  group_by(op_id) %>%
  # Chunks of 3,000,000 rows within each op_id. The 0-based integer division
  # puts rows 1..3e6 in chunk 1, so every chunk has the same capacity, and
  # the result is already an integer.
  mutate(chunk = (row_number() - 1L) %/% 3000000L + 1L) %>%
  group_by(prj_id) %>%
  # Running row index within each project.
  mutate(radek = row_number()) %>%
  # Grouping is left in place on the value passed to add_long_geoid().
  group_by(obec_puvod, op_id, chunk, rozpad_typ) %>%
  select(
    prj_id, radek, op_id, level,
    geo_id = value, level_orig, geo_id_orig = id_orig,
    rozpad_typ, rozpad_duvod, obec_puvod, chunk
  ) %>%
  add_long_geoid(ids)
## Joining, by = c("op_id", "prj_id")
## Joining, by = c("level", "geo_id")
# Number of distinct projects in dtl_smpl.
n_distinct(dtl_smpl$prj_id)
## [1] 49464
# Count the distinct (prj_id, rozpad_typ) pairs per rozpad_typ
# (any grouping carried by dtl_smpl still applies).
count(distinct(dtl_smpl, prj_id, rozpad_typ), rozpad_typ)

# Join the by-level geostructure table on the shared columns, drop grouping,
# and summarise every column. The data is piped into skim() on purpose so the
# summary keeps reporting its name as piped data.
left_join(dtl_smpl, dt_geostructure_bylvl) %>%
  ungroup() %>%
  skimr::skim()
## Joining, by = c("prj_id", "op_id", "level")
Data summary
Name Piped data
Number of rows 65769
Number of columns 15
_______________________
Column type frequency:
factor 1
character 9
logical 2
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
level 0 1 TRUE 2 obe: 58963, zuj: 6806, orp: 0, okr: 0

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
prj_id 0 1.00 11 34 0 49464 0
op_id 0 1.00 4 6 0 7 0
geo_id 0 1.00 6 6 0 5732 0
level_orig 0 1.00 3 4 0 2 0
geo_id_orig 0 1.00 11 11 0 5732 0
rozpad_typ 0 1.00 11 26 0 4 0
rozpad_duvod 65769 0.00 NA NA 0 0 0
obec_puvod 0 1.00 54 54 0 1 0
geo_id_long 4448 0.93 11 11 0 5710 0

Variable type: logical

skim_variable n_missing complete_rate mean count
min_level 0 1 0.93 TRU: 61388, FAL: 4381
multiunit 0 1 0.23 FAL: 50585, TRU: 15184

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
radek 0 1 4.27 15.28 1 1 1 1 251 ▇▁▁▁▁
chunk 0 1 1.00 0.00 1 1 1 1 1 ▁▁▇▁▁
n_units 0 1 7.27 25.89 1 1 1 1 251 ▇▁▁▁▁
# Create the output directory (silently if it already exists), then write
# dtl_smpl into it as a single parquet file. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
dir.create(here::here("data-processed", "dtl_resolved_smpl"),
           recursive = TRUE, showWarnings = FALSE)
write_parquet(dtl_smpl,
              here::here("data-processed", "dtl_resolved_smpl", "all.parquet"))