% File: mamlr/man/dupe_detect.Rd
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dupe_detect.R
\name{dupe_detect}
\alias{dupe_detect}
\title{Get ids of duplicate documents that have a cosine similarity score higher than [cutoff_lower]}
\usage{
dupe_detect(
row,
grid,
cutoff_lower,
cutoff_upper = 1,
es_pwd,
es_super,
words,
localhost = T,
ver
)
}
\arguments{
\item{row}{Row of grid to parse}
\item{grid}{A cross-table of all possible combinations of doctypes and dates}
\item{cutoff_lower}{Cutoff value for minimum cosine similarity above which documents are considered duplicates (inclusive)}
\item{cutoff_upper}{Cutoff value for maximum cosine similarity, above which documents are not considered duplicates (for debugging and manual parameter tuning, inclusive)}
\item{es_pwd}{Password for Elasticsearch read access}
\item{es_super}{Password for write access to Elasticsearch}
\item{words}{Document cutoff point in number of words. Documents are cut off at the last [.?!] before the cutoff (so document will be a little shorter than [words])}
\item{localhost}{Defaults to true. When true, connect to a local Elasticsearch instance on the default port (9200)}
\item{ver}{Short string (preferably a single word/sequence) indicating the version of the updated document (e.g. for a udpipe update this string might be 'udV2')}
}
\value{
dupe_objects.json and data frame containing each id and all its duplicates. remove_ids.txt and character vector with list of ids to be removed. Files are in current working directory
}
\description{
Get ids of duplicate documents that have a cosine similarity score higher than [cutoff_lower]
}
\examples{
dupe_detect(1, grid, cutoff_lower, cutoff_upper = 1, es_pwd, es_super, words,
  localhost = TRUE, ver = 'udV2')
}