diff --git a/DESCRIPTION b/DESCRIPTION index a4f0d7d..8b53750 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,7 +16,8 @@ Depends: R (>= 3.3.1), udpipe, SparseM, future, - future.apply + future.apply, + data.table (>= 1.9.8) License: Copyright Erik de Vries Encoding: UTF-8 LazyData: true diff --git a/R/actorizer.R b/R/actorizer.R index 7a00ea6..14d6917 100644 --- a/R/actorizer.R +++ b/R/actorizer.R @@ -50,19 +50,12 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t mutate( sentence_count = n() ) - - hits <- left_join(ud, markers, by='_id') %>% - mutate( - actor = case_when( - start <= marker_start & end >= marker_start ~ T, - T ~ F - ) - ) %>% - select(`_id`, sentence_id, start, end,actor,merged) %>% - filter(actor) %>% + hits <- as.data.table(ud)[as.data.table(markers), .(`_id`, lemma,x.start, start, end, x.end, sentence_id, merged), on =.(`_id` = `_id`, start <= marker_start, end >= marker_start)] %>% + mutate(end = x.end, + start = x.start) %>% + select(`_id`, sentence_id, start, end,merged) %>% group_by(`_id`,sentence_id) %>% summarise( - actor = any(actor), actor_start = I(list(start)), actor_end = I(list(end)), n_markers = length(start),