@@ -14,13 +14,28 @@
#' @examples
#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_tags, es_super, ver) {
sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
doc <- out[row, ]
if (sum(nchar(doc$merged) > 990000)) {
offsetter <- function(x, pre_tags, post_tags) {
  return(as.list(as.data.frame(x - ((row(x) - 1) * (nchar(pre_tags) + nchar(post_tags))))))
}
out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
prefix[prefix == ''] <- NA
postfix[postfix == ''] <- NA
pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
out$markers <- lapply(str_locate_all(out$merged, coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
markers <- out %>%
  select(`_id`, markers) %>%
  unnest_wider(markers) %>%
  rename(marker_start = start, marker_end = end) %>%
  unnest(colnames(.))
if (sum(nchar(out$merged) > 990000) > 0) {
  stop("One or more documents in this batch exceed 990000 characters")
}
# Extracting ud output from document
ud <- doc %>%
ud <- out %>%
  select(`_id`, `_source.ud`, merged) %>%
  unnest(cols = c("_source.ud")) %>%
  select(`_id`, lemma, start, end, sentence_id, merged) %>%
@@ -96,44 +111,17 @@ actorizer <- function(out, localhost = F, ids, prefix, postfix, pre_tags, post_t
  sentence_count = first(sentence_count) # List of actor ids
) %>%
mutate(
  prom = occ / sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
  rel_first = 1 - (first / sentence_count), # Relative position of first occurence at sentence level
  prom = occ / sentence_count, # Relative prominence of actor in article (number of occurrences/total # sentences)
  rel_first = 1 - (first / sentence_count), # Relative position of first occurrence at sentence level
) %>%
select(`_id`:occ, prom, rel_first, first, ids)
return(hits)
}
out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
offsetter <- function(x, pre_tags, post_tags) {
  return(as.list(as.data.frame(x - ((row(x) - 1) * (nchar(pre_tags) + nchar(post_tags))))))
}
prefix[prefix == ''] <- NA
postfix[postfix == ''] <- NA
pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
out$markers <- future_lapply(str_locate_all(out$merged, coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags)
markers <- out %>%
  select(`_id`, markers) %>%
  unnest_wider(markers) %>%
  rename(marker_start = start, marker_end = end) %>%
  unnest(colnames(.))
# ids <- fromJSON(ids)
updates <- sentencizer(1:1024,
  out = out,
  ids = ids,
  postfix = postfix,
  prefix = prefix,
  pre_tags_regex = pre_tags_regex,
  pre_tags = pre_tags,
  post_tags_regex = post_tags_regex,
  post_tags = post_tags)
if (nrow(updates) == 0) {
if (nrow(hits) == 0) {
  print("Nothing to update for this batch")
  return(NULL)
} else {
  bulk <- apply(updates, 1, bulk_writer, varname = 'actorsDetail', type = 'add', ver = ver)
  bulk <- c(bulk, apply(updates[c(1, 11)], 1, bulk_writer, varname = 'actors', type = 'add', ver = ver))
  bulk <- apply(hits, 1, bulk_writer, varname = 'actorsDetail', type = 'add', ver = ver)
  bulk <- c(bulk, apply(hits[c(1, 11)], 1, bulk_writer, varname = 'actors', type = 'add', ver = ver))
  return(elastic_update(bulk, es_super = es_super, localhost = localhost))
}
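
Editor's note on the offsetter() helper touched by this diff: it compensates for the characters that the Elasticsearch highlighter's pre/post tags insert into the merged text, by shifting each match back by the cumulative length of the preceding tag pairs. A minimal sketch of that arithmetic, using hypothetical <em>/</em> tags (the real values are whatever pre_tags/post_tags the caller passes) and only stringr, which the code above already relies on:

# Hypothetical tag values, for illustration only
library(stringr)
pre_tags  <- "<em>"
post_tags <- "</em>"

tagged   <- "The <em>cat</em> sat next to the <em>cat</em> again"
untagged <- "The cat sat next to the cat again"

# Matrix of start/end positions of each pre_tag in the tagged string
x <- str_locate_all(tagged, coll(pre_tags))[[1]]

# Same correction as offsetter(): every earlier highlight pair added
# nchar(pre_tags) + nchar(post_tags) characters, so subtract that amount
# once per preceding match
corrected <- x - ((row(x) - 1) * (nchar(pre_tags) + nchar(post_tags)))

corrected[, "start"]                                  # 5 25
str_locate_all(untagged, coll("cat"))[[1]][, "start"] # 5 25, i.e. the same positions in the untagged text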