You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
29 lines
1.1 KiB
29 lines
1.1 KiB
% Generated by roxygen2: do not edit by hand
|
|
% Please edit documentation in R/dupe_detect.R
|
|
\name{dupe_detect}
|
|
\alias{dupe_detect}
|
|
\title{Get ids of duplicate documents whose cosine similarity score is at or above the lower cutoff}
|
|
\usage{
|
|
dupe_detect(row, grid, cutoff_lower, cutoff_upper = 1, es_pwd)
|
|
}
|
|
\arguments{
|
|
\item{row}{Row of grid to parse}
|
|
|
|
\item{grid}{A cross-table of all possible combinations of doctypes and dates}
|
|
|
|
\item{cutoff_lower}{Minimum cosine similarity at or above which documents are considered duplicates (inclusive)}
|
|
|
|
\item{cutoff_upper}{Maximum cosine similarity (inclusive) above which documents are no longer considered duplicates; intended for debugging and manual parameter tuning}
|
|
|
|
\item{es_pwd}{Password for Elasticsearch read access}
|
|
}
|
|
\value{
|
|
\code{dupe_objects.json} and a data frame containing each id and all its duplicates; \code{remove_ids.txt} and a character vector listing the ids to be removed. The files are in the current working directory.
|
|
}
|
|
\description{
|
|
Get ids of duplicate documents whose cosine similarity score is at or above \code{cutoff_lower} and no higher than \code{cutoff_upper}.
|
|
}
|
|
\examples{
|
|
dupe_detect(1,grid,cutoff_lower, cutoff_upper = 1, es_pwd)
|
|
}
|