actorizer, query_gen_actors: revamped actor searches entirely

elasticizer: updated script for use with ES 7.x
6 years ago · a1b6c6a7cb
parent 88fc4ec53c
commit a1b6c6a7cb
6 changed files with 289 additions and 216 deletions
--- a/1
+++ b/1
@ -7,6 +7,7 @@ export(dfm_gen)
 export(dupe_detect)
 export(elastic_update)
 export(elasticizer)
+export(lemma_writer)
 export(merger)
 export(modelizer)
 export(query_gen_actors)
--- a/R/actorizer.R
+++ b/R/actorizer.R
@ -12,17 +12,19 @@
 #' @return As this is a nested function used within elasticizer, there is no return output
 #' @export
 #' @examples
-#' actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super)
-actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier, es_super, ver) {
+#' actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
+actorizer <- function(out, localhost = F, ids, prefix, postfix, identifier, es_super, ver) {
  ### Function to filter out false positives using regex
-  exceptionizer <- function(id, ud, doc, markers, regex_identifier, prefix, postfix) {
+  exceptionizer <- function(id, ud, doc, markers, pre_tags_regex, post_tags_regex,pre_tags,post_tags, prefix, postfix) {
    min <- min(ud$start[ud$sentence_id == id]) # Get start position of sentence
    max <- max(ud$end[ud$sentence_id == id]) # Get end position of sentence
    split <- markers[markers %in% seq(min, max, 1)] # Get markers in sentence
-    max <- max+(length(split)*nchar(identifier)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
+    min <- min+((nchar(pre_tags)+nchar(post_tags))*((match(split,markers))-1))
+    max <- max+((nchar(pre_tags)+nchar(post_tags))*match(split,markers)) # Set end position to include markers (e.g if there are two markers of three characters in the sentence, the end position needs to be shifted by +6)
    sentence <- str_sub(doc$merged, min, max) # Extract sentence from text
+
    # Check if none of the regexes match, if so, return sentence id, otherwise (if one of the regexes match) return nothing
-    if (!str_detect(sentence, paste0(regex_identifier,postfix)) && !str_detect(sentence, paste0(prefix,regex_identifier))) {
+    if (!str_detect(sentence, paste0(post_tags_regex,'(',postfix,')')) && !str_detect(sentence, paste0('(',prefix,')',pre_tags_regex))) {
      return(id)
    } else {
      return(NULL)
@ -31,8 +33,16 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
  ranger <- function(x, ud) {
    return(which((ud$start <= x) & (ud$end >= x)))
  }
-  sentencizer <- function(row, out, ids, prefix, postfix, identifier, type) {
+  sentencizer <- function(row, out, ids, prefix, postfix, pre_tags, post_tags, pre_tags_regex, post_tags_regex) {
    doc <- out[row,]
+    if (nchar(doc$merged) > 990000) {
+      return(
+        data.frame(
+          err = T,
+          errorMessage = "Merged document exceeded 990000 characters, highlighting possibly incorrect"
+        )
+      )
+    }
    # Extracting ud output from document
    ud <- doc$`_source.ud`[[1]] %>%
      select(-one_of('exists')) %>% # Removing ud.exists variable
@ -48,11 +58,9 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier

    sentence_count <- max(ud$sentence_id) # Number of sentences in article
    actor_sentences <- unique(ud$sentence_id[ud$actor]) # Sentence ids of sentences mentioning actor
-    actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
-    actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor

-    # Conducting regex filtering on matches only when actor type is Party
-    if (type == "Party") {
+    # Conducting regex filtering on matches only when there is a prefix and/or postfix to apply
+    if (!is.na(prefix) || prefix != '' || !is.na(postfix) || postfix != '') {
      ### If no pre or postfixes, match *not nothing* i.e. anything
      if (is.na(prefix) || prefix == '') {
        prefix = '$^'
@ -60,14 +68,27 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
      if (is.na(postfix) || postfix == '') {
        postfix = '$^'
      }
-      sentence_ids <- lapply(actor_sentences, exceptionizer, ud = ud, doc = doc, markers = markers, regex_identifier = regex_identifier, prefix = prefix, postfix = postfix)
+      sentence_ids <- unlist(lapply(actor_sentences,
+                             exceptionizer,
+                             ud = ud,
+                             doc = doc,
+                             markers = markers,
+                             pre_tags_regex = pre_tags_regex,
+                             pre_tags = pre_tags,
+                             post_tags_regex = post_tags_regex,
+                             post_tags = post_tags,
+                             prefix = prefix,
+                             postfix = postfix))
    } else {
      sentence_ids <- actor_sentences
    }

    # Generating nested sentence start and end positions for actor sentences
    ud <- ud %>%
-      filter(sentence_id %in% sentence_ids) %>%
+      filter(sentence_id %in% sentence_ids)
+    actor_start <- ud$start[ud$actor == T] # Udpipe token start positions for actor
+    actor_end <- ud$end[ud$actor == T] # Udpipe token end positions for actor
+    ud <- ud %>%
      group_by(sentence_id) %>%
      summarise (
        sentence_start = as.integer(min(start)),
@ -80,28 +101,42 @@ actorizer <- function(out, localhost = F, ids, type, prefix, postfix, identifier
        sentence_start = list(sentence_start),
        sentence_end = list(sentence_end)
      )
-
-    return(
-      data.frame(ud, # Sentence id, start and end position for actor sentences
-                 actor_start = I(list(actor_start)), # List of actor ud token start positions
-                 actor_end = I(list(actor_end)), # List of actor ud token end positions
-                 occ = length(unique(actor_sentences)), # Number of sentences in which actor occurs
-                 prom = length(unique(actor_sentences))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
-                 rel_first = 1-(min(actor_sentences)/sentence_count), # Relative position of first occurence at sentence level
-                 first = min(actor_sentences), # First sentence in which actor is mentioned
-                 ids = I(list(ids)) # List of actor ids
+    if (length(ud$doc_id > 0)) {
+      return(
+        data.frame(ud, # Sentence id, start and end position for actor sentences
+                   actor_start = I(list(actor_start)), # List of actor ud token start positions
+                   actor_end = I(list(actor_end)), # List of actor ud token end positions
+                   occ = length(unique(sentence_ids)), # Number of sentences in which actor occurs
+                   prom = length(unique(sentence_ids))/sentence_count, # Relative prominence of actor in article (number of occurences/total # sentences)
+                   rel_first = 1-(min(sentence_ids)/sentence_count), # Relative position of first occurence at sentence level
+                   first = min(sentence_ids), # First sentence in which actor is mentioned
+                   ids = I(list(ids)) # List of actor ids
+        )
      )
-    )
+    } else {
+      return(NULL)
+    }
+
  }
  out <- mamlr:::out_parser(out, field = 'highlight', clean = F)
-  offsetter <- function(x, identifier) {
-    return(x-((row(x)-1)*nchar(identifier)))
+  offsetter <- function(x, pre_tags, post_tags) {
+    return(x-((row(x)-1)*(nchar(pre_tags)+nchar(post_tags))))
  }
-  regex_identifier <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", identifier)
-  out$markers <- mclapply(str_locate_all(out$merged,coll(identifier)), offsetter, identifier = identifier, mc.cores = detectCores())
+  pre_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", pre_tags)
+  post_tags_regex <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", post_tags)
+  out$markers <- mclapply(str_locate_all(out$merged,coll(pre_tags)), offsetter, pre_tags = pre_tags, post_tags = post_tags, mc.cores = detectCores())

-  ids <- fromJSON(ids)
-  updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer, out = out, ids = ids, postfix = postfix, prefix=prefix, identifier=identifier, type = type, mc.cores = detectCores()))
+  # ids <- fromJSON(ids)
+  updates <- bind_rows(mclapply(seq(1,length(out[[1]]),1), sentencizer,
+                                out = out,
+                                ids = ids,
+                                postfix = postfix,
+                                prefix=prefix,
+                                pre_tags_regex = pre_tags_regex,
+                                pre_tags = pre_tags,
+                                post_tags_regex = post_tags_regex,
+                                post_tags = post_tags,
+                                mc.cores = detectCores()))
  bulk <- apply(updates, 1, bulk_writer, varname ='actorsDetail', type = 'add', ver = ver)
  bulk <- c(bulk,apply(updates[c(1,11)], 1, bulk_writer, varname='actors', type = 'add', ver = ver))
  return(elastic_update(bulk, es_super = es_super, localhost = localhost))
--- a/R/elasticizer.R
+++ b/R/elasticizer.R
@ -68,11 +68,11 @@ elasticizer <- function(query, src = T, index = 'maml', es_pwd = .rs.askForPassw
    }
  }
  json <- fromJSON(res)
-  if (json$hits$total == 0) {
+  if (json$hits$total$value == 0) {
    return(json)
  } else {
    out <-  jsonlite:::flatten(json$hits$hits)
-    total <- json$hits$total
+    total <- json$hits$total$value
    hits <- length(json$hits$hits)
    batch <- 1
    print(paste0('Processing documents ',batch*batch_size-batch_size,' through ',batch*batch_size,' out of ',total,' documents.'))
--- a/R/query_gen_actors.R
+++ b/R/query_gen_actors.R
@ -3,7 +3,8 @@
 #' Generate actor search queries based on data in actor db
 #' @param actor A row from the output of elasticizer() when run on the 'actor' index
 #' @param country 2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.
-#' @param identifier Identifier used to mark hits in the text, identifiers are prepended before the actual hit
+#' @param pre_tags Highlighter pre-tag
+#' @param post_tags Highlighter post-tag
 #' @return A data frame containing the queries, related actor ids and actor function
 #' @export
 #' @examples
@ -12,134 +13,170 @@
 #################################################################################################
 #################################### Actor search query generator ###############################
 #################################################################################################
-query_gen_actors <- function(actor, country, identifier) {
-  highlight <- paste0('"highlight" : {
-                      "fields" : {
-                      "text" : {},
-                      "teaser" : {},
-                      "preteaser" : {},
-                      "title" : {},
-                      "subtitle" : {}
-                      },
-                      "number_of_fragments": 0,
-                      "order": "none",
-                      "type":"unified",
-                      "fragment_size":0,
-                      "pre_tags":"',identifier,'",
-                      "post_tags": ""
-}')
-if (country == "no") {
-  genitive <- 's'
-  definitive <- 'en'
-  definitive_genitive <- 'ens'
-} else if (country == 'uk') {
-  genitive <- '\'s'
-  definitive <- 's'
-  definitive_genitive <- ''
-} else {
-  genitive <- ''
-  definitive <- ''
-  definitive_genitive <- ''
-}
+query_gen_actors <- function(actor, country, pre_tags, post_tags) {
+  generator <- function(country, startdate, enddate, querystring, pre_tags, post_tags, actorid) {
+    return(paste0('{"query":
+                    {"bool": {
+                      "filter":[
+                        {"term":{"country":"',country,'"}},
+                        {"range":{"publication_date":{"gte":"',startdate,'","lte":"',enddate,'"}}},
+                        {"query_string" : {
+                          "default_operator" : "OR",
+                          "allow_leading_wildcard" : "false",
+                          "fields": ["text","teaser","preteaser","title","subtitle"],
+                          "query" : "', querystring,'"
+                          }
+                        }
+                      ],
+                      "must_not":[
+                        {"term":{"computerCodes.actors":"',actorid,'"}}
+                      ]
+                    }
+                  },
+                "highlight" : {
+                  "fields" : {
+                  "text" : {},
+                  "teaser" : {},
+                  "preteaser" : {},
+                  "title" : {},
+                  "subtitle" : {}
+                  },
+                  "number_of_fragments": 0,
+                  "order": "none",
+                  "type":"unified",
+                  "fragment_size":0,
+                  "pre_tags":"', pre_tags,'",
+                  "post_tags": "',post_tags,'"
+                }
+              }'))
+  }
+  ### Setting linguistic forms for each country ###
+  if (country == "no" | country == "dk") {
+    genitive <- 's'
+    definitive <- 'en'
+    definitive_genitive <- 'ens'
+  } else if (country == 'uk') {
+    genitive <- '\'s'
+  } else if (country == 'nl' | country == 'be') {
+    genitive <- 's'
+  }

-  if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader") {
+  ### Generating queries for individuals (ministers, PM, Party leaders and MPs)
+  if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM" | actor$`_source.function` == "PartyLeader" | actor$`_source.function` == "MP") {
    lastname <- paste0('(',actor$`_source.lastName`,' OR ',actor$`_source.lastName`,genitive,')')
    ## Adding a separate AND clause for inclusion of only last name to highlight all occurences of last name
    ## Regardless of whether the last name hit is because of a minister name or a full name proximity hit
-    query_string <- paste0('(((\\"',
-                           actor$`_source.firstName`,
-                           ' ',
-                           actor$`_source.lastName`,
-                           '\\"~5 OR \\"',
-                           actor$`_source.firstName`,
-                           ' ',
-                           actor$`_source.lastName`,genitive,
-                           '\\"~5) AND ',lastname)
-  }
-  if (actor$`_source.function` == "PartyLeader") {
-    query_string <- paste0(query_string,'))')
-    ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.partyId`),str_c, "_pl")))
-  }
-  if (actor$`_source.function` == "Min" | actor$`_source.function` == "PM") {
-    ## Modifiers are only applied to minister titles
-    capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title))
-    capital_gen <- unlist(lapply(capital, str_c, genitive))
-    capital_def <- unlist(lapply(capital, str_c, definitive))
-    capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive))
-    gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive))
-    def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive))
-    defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive))
-    names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ')
-    query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))')
-    ids <- toJSON(unlist(lapply(c(actor$`_source.actorId`,actor$`_source.ministryId`,actor$`_source.partyId`),str_c, "_min")))
-  }
-  if (actor$`_source.function` == "Institution") {
-    #uppercasing
-    firstup <- function(x) {
-      substr(x, 1, 1) <- toupper(substr(x, 1, 1))
-      x
+
+    ### If country is belgium, check if there is an apostrophe in middlenames, if so, search for last name both with capitalized and lowercased last name
+    if (country == 'be') {
+      if (T %in% str_detect(actor$`_source.middleNames`,"'")) {
+        query_string <- paste0('(((\\"',
+                               actor$`_source.firstName`,
+                               ' ',
+                               actor$`_source.lastName`,
+                               '\\"~5 OR \\"',
+                               actor$`_source.firstName`,
+                               ' ',
+                               tolower(actor$`_source.lastName`),
+                               '\\"~5 OR \\"',
+                               actor$`_source.firstName`,
+                               ' ',
+                               tolower(actor$`_source.lastName`),genitive,
+                               '\\"~5 OR \\"',
+                               actor$`_source.firstName`,
+                               ' ',
+                               actor$`_source.lastName`,genitive,
+                               '\\"~5) AND ',lastname)
+      }
+    } else {
+      query_string <- paste0('(((\\"',
+                             actor$`_source.firstName`,
+                             ' ',
+                             actor$`_source.lastName`,
+                             '\\"~5 OR \\"',
+                             actor$`_source.firstName`,
+                             ' ',
+                             actor$`_source.lastName`,genitive,
+                             '\\"~5) AND ',lastname)
    }
+
+
+    ### If actor is a minister, generate minister search
+    if (actor$`_source.function` == "Minister" | actor$`_source.function` == "PM") {
+      capital <- unlist(lapply(actor$`_source.ministerSearch`, str_to_title))
+      capital_gen <- unlist(lapply(capital, str_c, genitive))
+      gen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, genitive))
+      # If country is no or dk, search for definitive minister forms as well (including genitive forms)
+      if (country == "no" || country == "dk") {
+        capital_def <- unlist(lapply(capital, str_c, definitive))
+        capital_defgen <- unlist(lapply(capital, str_c, definitive_genitive))
+        def <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive))
+        defgen <- unlist(lapply(actor$`_source.ministerSearch`, str_c, definitive_genitive))
+        names <- paste(c(capital,capital_gen,gen,capital_def,def,defgen,capital_defgen), collapse = ' ')
+      } else {
+        names <- paste(c(capital,capital_gen,gen,actor$`_source.ministerSearch`), collapse = ' ')
+      }
+      # If country is uk, search for last name, minister names and minister titles (i.e. Johnson AND state AND secretary)
+      if (country == "uk") {
+        minister <- paste(c(str_to_title(actor$`_source.ministerName`),
+                      actor$`_source.ministerName`,
+                      str_c(str_to_title(actor$`_source.ministerName`),genitive),
+                      str_c(actor$`_source.ministerName`,genitive)), collapse = ' ')
+        query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND (',unlist(minister),')))')
+      }
+      # If country is nl or be, add a requirement for Minister to the query
+      else if (country == "nl" | country == "be") {
+        query_string <- paste0(query_string,') OR (',lastname,' AND (',names,') AND ("Minister" OR "minister")))')
+      } else {
+        query_string <- paste0(query_string,') OR (',lastname,' AND (',names,')))')
+      }
+    } else {
+      ### Else, generate search for first/last name only (MPs and Party leaders, currently)
+      query_string <- paste0(query_string,'))')
+    }
+    ids <- list(c(actor$`_source.actorId`,str_c(actor$`_source.partyId`,'_a')))
+    actorid <- actor$`_source.actorId`
+    query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
+    return(data.frame(query = query, ids = I(ids), stringsAsFactors = F))
+  }
+  ### Query generation for party searches
+  if (actor$`_source.function` == "Party") {
    actor$`_source.startDate` <- "2000-01-01"
    actor$`_source.endDate` <- "2099-01-01"
-    if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
-      upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
-      upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
-                 unlist(lapply(upper, str_c, definitive)),
-                 unlist(lapply(upper, str_c, definitive_genitive)))
-      capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
-      capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
-                 unlist(lapply(capital, str_c, definitive)),
-                 unlist(lapply(capital, str_c, definitive_genitive)))
-      base <- actor$`_source.institutionNameSearch`
-      base <- c(base, unlist(lapply(base, str_c, genitive)),
-                   unlist(lapply(base, str_c, definitive)),
-                   unlist(lapply(base, str_c, definitive_genitive)))
-      names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
-      query_string <- paste0('(\\"',names,'\\")')
-      ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
-      query <- paste0('{"query":
-                    {"bool": {"filter":[{"term":{"country":"',country,'"}},
-                    {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
-                    {"query_string" : {
-                    "default_operator" : "OR",
-                    "allow_leading_wildcard" : "false",
-                    "fields": ["text","teaser","preteaser","title","subtitle"],
-                    "query" : "', query_string,'"
-                    }
-                    }
-                    ]
-                    } },',highlight,' }')
-      df1 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F)
+    if (nchar(actor$`_source.partyNameSearchShort`[[1]]) > 0) {
+      # If uk, no or dk, search for both regular abbreviations, and genitive forms
+      if (country == "uk" | country == "no" | country == "dk") {
+        gen <- unlist(lapply(actor$`_source.partyNameSearchShort`, str_c, genitive))
+        names <- paste(c(gen,actor$`_source.partyNameSearchShort`), collapse = '\\\" \\\"')
+      } else {
+        names <- paste(actor$`_source.partyNameSearchShort`, collapse = '\\\" \\\"')
+      }
+      # If no or dk, only keep genitive forms if the party abbreviation is longer than 1 character (2 including the genitive s itself)
+      if (country == "dk" | country == "no") {
+        gen <- gen[which(nchar(gen) > 2)]
+      }
+      query_string <- paste0('(\\\"',names,'\\\")')
+      ids <- str_c(actor$`_source.partyId`,'_s')
+      actorid <- str_c(actor$`_source.partyId`,'_s')
+      query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
+      df1 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
    }
-    if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
-      upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
-      upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
-                 unlist(lapply(upper, str_c, definitive)),
-                 unlist(lapply(upper, str_c, definitive_genitive)))
-      capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
-      capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
-                   unlist(lapply(capital, str_c, definitive)),
-                   unlist(lapply(capital, str_c, definitive_genitive)))
-      base <- actor$`_source.institutionNameSearchShort`
-      base <- c(base, unlist(lapply(base, str_c, genitive)),
-                unlist(lapply(base, str_c, definitive)),
-                unlist(lapply(base, str_c, definitive_genitive)))
-      names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
-      query_string <- paste0('(\\"',names,'\\")')
-      ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
-      query <- paste0('{"query":
-                    {"bool": {"filter":[{"term":{"country":"',country,'"}},
-                    {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
-                    {"query_string" : {
-                    "default_operator" : "OR",
-                    "allow_leading_wildcard" : "false",
-                    "fields": ["text","teaser","preteaser","title","subtitle"],
-                    "query" : "', query_string,'"
-                    }
-                    }
-                    ]
-                    } },',highlight,' }')
-      df2 <- data.frame(query = query, ids = I(ids), type = actor$`_source.function`, stringsAsFactors = F)
+    if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
+      if (country == "uk" | country == "no" | country == "dk") {
+        gen <- unlist(lapply(actor$`_source.partyNameSearch`, str_c, genitive))
+        names <- paste(c(gen,actor$`_source.partyNameSearch`), collapse = '\\\" \\\"')
+      } else {
+        names <- paste(actor$`_source.partyNameSearch`, collapse = '\\\" \\\"')
+      }
+      query_string <- paste0('(\\\"',names,'\\\")')
+      ids <- str_c(actor$`_source.partyId`,'_f')
+      actorid <- str_c(actor$`_source.partyId`,'_f')
+      query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
+      if (country == 'uk') {
+        df2 <- data.frame(query = query, ids = I(ids), prefix = actor$`_source.notPrecededBy`, postfix = actor$`_source.notFollowedBy`, stringsAsFactors = F)
+      } else {
+        df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
+      }
    }
    if (exists('df1') == T & exists('df2') == T) {
      return(bind_rows(df1,df2))
@ -149,63 +186,61 @@ if (country == "no") {
      return(df2)
    }
  }
-  if (actor$`_source.function` == "Party") {
-    actor$`_source.startDate` <- "2000-01-01"
-    actor$`_source.endDate` <- "2099-01-01"
-    names <- paste(c(unlist(actor$`_source.partyNameSearchShort`)), collapse = '\\" \\"')
-    query_string <- paste0('(\\"',names,'\\")')
-    query <- paste0('{"query":
-                    {"bool": {"filter":[{"term":{"country":"',country,'"}},
-                    {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
-                    {"query_string" : {
-                    "default_operator" : "OR",
-                    "allow_leading_wildcard" : "false",
-                    "fields": ["text","teaser","preteaser","title","subtitle"],
-                    "query" : "', query_string,'"
-                    }
-                    }
-                    ]
-                    } },',highlight,' }')
-    ids <- c(toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p"))))
-
-    if (nchar(actor$`_source.partyNameSearch`[[1]]) > 0) {
-      names <- paste(c(unlist(actor$`_source.partyNameSearch`)), collapse = '\\" \\"')
-      query_string <- paste0('(\\"',names,'\\")')
-      query2 <- paste0('{"query":
-                       {"bool": {"filter":[{"term":{"country":"',country,'"}},
-                       {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
-                       {"query_string" : {
-                       "default_operator" : "OR",
-                       "allow_leading_wildcard" : "false",
-                       "fields": ["text","teaser","preteaser","title","subtitle"],
-                       "query" : "', query_string,'"
-                       }
-                       }
-                       ]
-                       } },',highlight,' }')
-
-
-    ids <- c(ids, toJSON(unlist(lapply(c(actor$`_source.partyId`),str_c, "_p"))))
-    query <- c(query, query2)
-    fn <- c('PartyAbbreviation','Party')
-    } else {
-      fn <- c('PartyAbbreviation')
-    }
-    return(data.frame(query = query, ids = I(ids), type = fn, prefix = actor$`_source.searchAnd`, postfix = actor$`_source.searchAndNot`, stringsAsFactors = F))
-  }
-
-  query <- paste0('{"query":
-                  {"bool": {"filter":[{"term":{"country":"',country,'"}},
-                  {"range":{"publication_date":{"gte":"',actor$`_source.startDate`,'","lte":"',actor$`_source.endDate`,'"}}},
-                  {"query_string" : {
-                  "default_operator" : "OR",
-                  "allow_leading_wildcard" : "false",
-                  "fields": ["text","teaser","preteaser","title","subtitle"],
-                  "query" : "', query_string,'"
-                  }
-                  }
-                  ]
-                  } },',highlight,' }')
-  fn <- actor$`_source.function`
-  return(data.frame(query = query, ids = I(ids), type = fn, stringsAsFactors = F))
+  ### Institution function currently not used
+  # if (actor$`_source.function` == "Institution") {
+  #   #uppercasing
+  #   firstup <- function(x) {
+  #     substr(x, 1, 1) <- toupper(substr(x, 1, 1))
+  #     x
+  #   }
+  #   actor$`_source.startDate` <- "2000-01-01"
+  #   actor$`_source.endDate` <- "2099-01-01"
+  #   if (nchar(actor$`_source.institutionNameSearch`[[1]]) > 0) {
+  #     upper <- unlist(lapply(actor$`_source.institutionNameSearch`, firstup))
+  #     upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
+  #                unlist(lapply(upper, str_c, definitive)),
+  #                unlist(lapply(upper, str_c, definitive_genitive)))
+  #     capital <- unlist(lapply(actor$`_source.institutionNameSearch`, str_to_title))
+  #     capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
+  #                  unlist(lapply(capital, str_c, definitive)),
+  #                  unlist(lapply(capital, str_c, definitive_genitive)))
+  #     base <- actor$`_source.institutionNameSearch`
+  #     base <- c(base, unlist(lapply(base, str_c, genitive)),
+  #               unlist(lapply(base, str_c, definitive)),
+  #               unlist(lapply(base, str_c, definitive_genitive)))
+  #     names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
+  #     query_string <- paste0('(\\"',names,'\\")')
+  #     ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_f")))
+  #     actorid <- str_c(actor$`_source.institutionId`,'_f')
+  #     query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
+  #     df1 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
+  #   }
+  #   if (nchar(actor$`_source.institutionNameSearchShort`[[1]]) > 0) {
+  #     upper <- unlist(lapply(actor$`_source.institutionNameSearchShort`, firstup))
+  #     upper <- c(upper, unlist(lapply(upper, str_c, genitive)),
+  #                unlist(lapply(upper, str_c, definitive)),
+  #                unlist(lapply(upper, str_c, definitive_genitive)))
+  #     capital <- unlist(lapply(actor$`_source.institutionNameSearchShort`, str_to_title))
+  #     capital <- c(capital, unlist(lapply(capital, str_c, genitive)),
+  #                  unlist(lapply(capital, str_c, definitive)),
+  #                  unlist(lapply(capital, str_c, definitive_genitive)))
+  #     base <- actor$`_source.institutionNameSearchShort`
+  #     base <- c(base, unlist(lapply(base, str_c, genitive)),
+  #               unlist(lapply(base, str_c, definitive)),
+  #               unlist(lapply(base, str_c, definitive_genitive)))
+  #     names <- paste(unique(c(upper,capital,base)), collapse = '\\" \\"')
+  #     query_string <- paste0('(\\"',names,'\\")')
+  #     ids <- toJSON(unlist(lapply(c(actor$`_source.institutionId`),str_c, "_s")))
+  #     actorid <- str_c(actor$`_source.institutionId`,'_s')
+  #     query <- generator(country, actor$`_source.startDate`, actor$`_source.endDate`, query_string, pre_tags, post_tags, actorid)
+  #     df2 <- data.frame(query = query, ids = I(ids), stringsAsFactors = F)
+  #   }
+  #   if (exists('df1') == T & exists('df2') == T) {
+  #     return(bind_rows(df1,df2))
+  #   } else if (exists('df1') == T) {
+  #     return(df1)
+  #   } else if (exists('df2') == T) {
+  #     return(df2)
+  #   }
+  # }
 }
--- a/man/actorizer.Rd
+++ b/man/actorizer.Rd
@ -4,8 +4,8 @@
 \alias{actorizer}
 \title{Updater function for elasticizer: Conduct actor searches}
 \usage{
-actorizer(out, localhost = F, ids, type, prefix, postfix, identifier,
-  es_super, ver)
+actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super,
+  ver)
 }
 \arguments{
 \item{out}{Does not need to be defined explicitly! (is already parsed in the elasticizer function)}
@ -31,5 +31,5 @@ As this is a nested function used within elasticizer, there is no return output
 Updater function for elasticizer: Conduct actor searches
 }
 \examples{
-actorizer(out, localhost = F, ids, type, prefix, postfix, identifier, es_super)
+actorizer(out, localhost = F, ids, prefix, postfix, identifier, es_super)
 }
--- a/man/query_gen_actors.Rd
+++ b/man/query_gen_actors.Rd
@ -4,14 +4,16 @@
 \alias{query_gen_actors}
 \title{Generate actor search queries based on data in actor db}
 \usage{
-query_gen_actors(actor, country, identifier)
+query_gen_actors(actor, country, pre_tags, post_tags)
 }
 \arguments{
 \item{actor}{A row from the output of elasticizer() when run on the 'actor' index}

 \item{country}{2-letter string indicating the country for which to generate the queries, is related to inflected nouns, definitive forms and genitive forms of names etc.}

-\item{identifier}{Identifier used to mark hits in the text, identifiers are prepended before the actual hit}
+\item{pre_tags}{Highlighter pre-tag}
+
+\item{post_tags}{Highlighter post-tag}
 }
 \value{
 A data frame containing the queries, related actor ids and actor function