
Commit bfedb5b

Author: Austin Dickey

Merge pull request #22 from mfrasco/feature/clarify_max_hits

clarified documentation for max_hits default parameter and then chang…

2 parents 32a757a + 985789e commit bfedb5b

File tree: 2 files changed, +15 -15 lines changed


r-pkg/R/elasticsearch_parsers.R

Lines changed: 11 additions & 13 deletions
@@ -548,7 +548,9 @@ chomp_hits <- function(hits_json = NULL, keep_nested_data_cols = TRUE) {
 #' @param es_host A string identifying an Elasticsearch host. This should be of the form
 #' \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}.
 #' @param es_index The name of an Elasticsearch index to be queried.
-#' @param max_hits Integer. If specified, \code{es_search} will stop pulling data as soon as it has pulled this many hits.
+#' @param max_hits Integer. If specified, \code{es_search} will stop pulling data as soon
+#' as it has pulled this many hits. Default is \code{Inf}, meaning that
+#' all possible hits will be pulled.
 #' @param size Number of records per page of results. See \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-from-size.html}{Elasticsearch docs} for more.
 #' Note that this will be reset to 0 if you submit a \code{query_body} with
 #' an "aggs" request in it. Also see \code{max_hits}.
@@ -617,7 +619,7 @@ es_search <- function(es_host
 , size = 10000
 , query_body = '{}'
 , scroll = "5m"
-, max_hits = NULL
+, max_hits = Inf
 , n_cores = ceiling(parallel::detectCores()/2)
 , break_on_duplicates = TRUE
 , ignore_scroll_restriction = FALSE
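
For reviewers, a quick usage sketch of the new default (not part of this diff). The host is the example value from the roxygen block above; the index name is a placeholder.

# With max_hits left at its new default of Inf, es_search() pulls every hit.
all_hits <- es_search(
    es_host = "http://myindex.thing.com:9200"
    , es_index = "some_index"    # placeholder index name
    , query_body = '{}'
)

# Passing a finite max_hits caps how much data is pulled.
first_500 <- es_search(
    es_host = "http://myindex.thing.com:9200"
    , es_index = "some_index"    # placeholder index name
    , max_hits = 500
)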
@@ -680,7 +682,9 @@ es_search <- function(es_host
 # time you expect to pass between requests. See the
 # \href{https://www.elastic.co/guide/en/Elasticsearch/guide/current/scroll.html}{Elasticsearch scroll/pagination docs}
 # for more information.
-# [param] max_hits Integer. If specified, \code{.fetch_all} will stop pulling data as soon as it passes this threshold.
+# [param] max_hits Integer. If specified, \code{es_search} will stop pulling data as soon
+# as it has pulled this many hits. Default is \code{Inf}, meaning that
+# all possible hits will be pulled.
 # [param] n_cores Number of cores to distribute fetching + processing over.
 # [param] break_on_duplicates Boolean, defaults to TRUE. \code{.fetch_all} uses the size of the final object it returns
 # to check whether or not some data were lost during the processing.
@@ -727,7 +731,7 @@ es_search <- function(es_host
 , size = 10000
 , query_body = '{}'
 , scroll = "5m"
-, max_hits = NULL
+, max_hits = Inf
 , n_cores = ceiling(parallel::detectCores()/2)
 , break_on_duplicates = TRUE
 , ignore_scroll_restriction = FALSE
@@ -756,7 +760,7 @@ es_search <- function(es_host
 # requesting more hits than you get is not costless:
 # - ES allocates a temporary data structure of size <size>
 # - you end up transmitting more data over the wire than the user wants
-if (!is.null(max_hits) && max_hits < size){
+if (max_hits < size) {
 msg <- paste0(sprintf("You requested a maximum of %s hits", max_hits),
 sprintf(" and a page size of %s.", size),
 sprintf(" Resetting size to %s for efficiency.", max_hits))
@@ -767,7 +771,7 @@ es_search <- function(es_host
 }

 # Warn if you are gonna give back a few more hits than max_hits
-if (!is.null(max_hits) && max_hits %% size != 0){
+if (!is.infinite(max_hits) && max_hits %% size != 0) {
 msg <- paste0("When max_hits is not an exact multiple of size, it is ",
 "possible to get a few more than max_hits results back.")
 futile.logger::flog.warn(msg)
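
A worked example of why that warning exists (illustrative only): the scroll loop further down keeps pulling whole pages while hits_pulled < max_hits, so the last page can push the total past max_hits.

max_hits <- 25
size <- 10
pages_pulled <- ceiling(max_hits / size)    # 3 full pages are fetched
pages_pulled * size                         # 30 hits come back, 5 more than max_hits
# 25 is finite and 25 %% 10 != 0, so the warning above is logged.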
@@ -925,7 +929,7 @@ es_search <- function(es_host
 # - returns the first page + a scroll_id which uniquely identifies the stack
 # [params] scroll_id - a unique key identifying the search context
 # out_path - A file path to write temporary output to. Passed in from .fetch_all
-# max_hits - max_hits, comes from .fetch_all. If left as NULL in your call to
+# max_hits - max_hits, comes from .fetch_all. If left as Inf in your call to
 # .fetch_all, this param has no influence and you will pull all the data.
 # otherwise, this is used to limit the result size.
 # scroll_url - Elasticsearch URL to hit to get the next page of data
@@ -947,12 +951,6 @@ es_search <- function(es_host
 , hits_to_pull
 ){

-# Deal with case where user tries to say "don't limit me" by setting
-# max_hits = NULL explicitly
-if (is.null(max_hits)){
-max_hits <- Inf
-}
-
 while (hits_pulled < max_hits){

 # Grab a page of hits, break if we got back an error
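
A short note on why the deleted NULL-handling block is no longer needed (a sketch, not from this diff): comparisons against Inf work directly in R, so with the new Inf default the while condition already means "keep pulling until the data runs out".

hits_pulled <- 0
hits_pulled < Inf    # TRUE
1e9 < Inf            # TRUE for any finite count, so only a finite max_hits makes this condition stop the loop
is.null(Inf)         # FALSE; with an Inf default, the old is.null() coercion was dead code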

r-pkg/man/es_search.Rd

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default.

0 commit comments