Remove text-level output from sentencizer and optimize storage by using factors

master
Erik de Vries 4 years ago
parent 523d86799c
commit cdc78039ed

@ -102,7 +102,9 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
yearmonth = strftime(publication_date, format = '%Y%m'), yearmonth = strftime(publication_date, format = '%Y%m'),
yearmonthday = strftime(publication_date, format = '%Y%m%d'), yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V") yearweek = strftime(publication_date, format = "%Y%V")
) ) %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(output) return(output)
} else if(!is.null(actors_meta)) { } else if(!is.null(actors_meta)) {
text_noactors <- df[lengths(ids) == 0L, text_noactors <- df[lengths(ids) == 0L,
@ -229,7 +231,9 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
yearweek = strftime(publication_date, format = "%Y%V") yearweek = strftime(publication_date, format = "%Y%V")
) %>% ) %>%
ungroup() %>% ungroup() %>%
select(-contains('Search'),-starts_with('not')) select(-contains('Search'),-starts_with('not')) %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(df) return(df)
} else { } else {
df <- text_sent %>% df <- text_sent %>%
@ -239,7 +243,10 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
yearmonthday = strftime(publication_date, format = '%Y%m%d'), yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V") yearweek = strftime(publication_date, format = "%Y%V")
) %>% ) %>%
ungroup() ungroup() %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(df)
} }
} }

@ -17,7 +17,9 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
out <- out[row,] out <- out[row,]
## Create df with article metadata (fields that are included in the elasticizer function) ## Create df with article metadata (fields that are included in the elasticizer function)
metadata <- out %>% metadata <- out %>%
select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud")) select(`_id`,`_source.publication_date`,`_source.doctype`) %>%
mutate(`_source.publication_date` = as.factor(`_source.publication_date`),
`_source.doctype` = as.factor(`_source.doctype`))
## Unnest documents into individual words ## Unnest documents into individual words
ud_sent <- out %>% select(`_id`,`_source.ud`) %>% ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
@ -100,28 +102,8 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
left_join(ud_sent,.,by = c('_id','sentence_id')) %>% left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
group_by(`_id`) group_by(`_id`)
## If there is a sent_dict, generate sentiment scores on article level
if(!is.null(sent_dict)) {
text_sent <- out %>%
summarise(
text.words = sum(words),
text.sent_words = sum(sent_words),
text.sentences = n()
) #%>%
# mutate(
# text.arousal = text.sent_words/text.words
# )
} else {
text_sent <- out %>%
summarise(
text.words = sum(words),
text.sentences = n()
)
}
out <- out %>% out <- out %>%
summarise_all(list) %>% summarise_all(list) %>%
left_join(.,text_sent,by='_id') %>%
left_join(.,metadata,by='_id') %>% left_join(.,metadata,by='_id') %>%
ungroup() ungroup()
return(out) return(out)

Loading…
Cancel
Save