Remove text-level output from sentencizer and optimize storage by using factors

master
Erik de Vries 4 years ago
parent 523d86799c
commit cdc78039ed

@ -102,7 +102,9 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
yearmonth = strftime(publication_date, format = '%Y%m'),
yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V")
)
) %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(output)
} else if(!is.null(actors_meta)) {
text_noactors <- df[lengths(ids) == 0L,
@ -229,7 +231,9 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
yearweek = strftime(publication_date, format = "%Y%V")
) %>%
ungroup() %>%
select(-contains('Search'),-starts_with('not'))
select(-contains('Search'),-starts_with('not')) %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(df)
} else {
df <- text_sent %>%
@ -239,7 +243,10 @@ actor_merger <- function(df, actors_meta = NULL, actor_groups = NULL, pos_cutoff
yearmonthday = strftime(publication_date, format = '%Y%m%d'),
yearweek = strftime(publication_date, format = "%Y%V")
) %>%
ungroup()
ungroup() %>%
mutate(across(where(is.character), as.factor)) %>%
mutate(across(where(is.Date), as.factor))
return(df)
}
}

@ -17,7 +17,9 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
out <- out[row,]
## Create df with article metadata (fields that are included in the elasticizer function)
metadata <- out %>%
select(`_id`,contains("_source"),-contains("computerCodes.actors"),-contains("ud"))
select(`_id`,`_source.publication_date`,`_source.doctype`) %>%
mutate(`_source.publication_date` = as.factor(`_source.publication_date`),
`_source.doctype` = as.factor(`_source.doctype`))
## Unnest documents into individual words
ud_sent <- out %>% select(`_id`,`_source.ud`) %>%
@ -100,28 +102,8 @@ sentencizer <- function(out, sent_dict = NULL, localhost = NULL, validation = F)
left_join(ud_sent,.,by = c('_id','sentence_id')) %>%
group_by(`_id`)
## If there is a sent_dict, generate sentiment scores on article level
if(!is.null(sent_dict)) {
text_sent <- out %>%
summarise(
text.words = sum(words),
text.sent_words = sum(sent_words),
text.sentences = n()
) #%>%
# mutate(
# text.arousal = text.sent_words/text.words
# )
} else {
text_sent <- out %>%
summarise(
text.words = sum(words),
text.sentences = n()
)
}
out <- out %>%
summarise_all(list) %>%
left_join(.,text_sent,by='_id') %>%
left_join(.,metadata,by='_id') %>%
ungroup()
return(out)

Loading…
Cancel
Save