Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 22 additions & 40 deletions eland/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import copy
import warnings
from collections import defaultdict
from datetime import datetime
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -39,6 +40,7 @@
from eland.common import (
DEFAULT_PAGINATION_SIZE,
DEFAULT_PIT_KEEP_ALIVE,
DEFAULT_PROGRESS_REPORTING_NUM_ROWS,
DEFAULT_SEARCH_SIZE,
SortOrder,
build_pd_series,
Expand Down Expand Up @@ -1198,18 +1200,34 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame:
def to_pandas(
self, query_compiler: "QueryCompiler", show_progress: bool = False
) -> pd.DataFrame:
df = self._es_results(query_compiler, show_progress)

return df
df_list: List[pd.DataFrame] = []
i = 0
for df in self.search_yield_pandas_dataframes(query_compiler=query_compiler):
if show_progress:
i = i + df.shape[0]
if i % DEFAULT_PROGRESS_REPORTING_NUM_ROWS == 0:
print(f"{datetime.now()}: read {i} rows")
df_list.append(df)

if show_progress:
print(f"{datetime.now()}: read {i} rows")

# pd.concat() can't handle an empty list
# because there aren't defined columns.
if not df_list:
return query_compiler._empty_pd_ef()
return pd.concat(df_list)

def to_csv(
self,
query_compiler: "QueryCompiler",
show_progress: bool = False,
**kwargs: Union[bool, str],
) -> Optional[str]:
df = self._es_results(query_compiler, show_progress)
return df.to_csv(**kwargs) # type: ignore[no-any-return]
return self.to_pandas( # type: ignore[no-any-return]
query_compiler=query_compiler, show_progress=show_progress
).to_csv(**kwargs)

def search_yield_pandas_dataframes(
self, query_compiler: "QueryCompiler"
Expand Down Expand Up @@ -1241,42 +1259,6 @@ def search_yield_pandas_dataframes(
df = self._apply_df_post_processing(df, post_processing)
yield df

def _es_results(
self, query_compiler: "QueryCompiler", show_progress: bool = False
) -> pd.DataFrame:
query_params, post_processing = self._resolve_tasks(query_compiler)

result_size, sort_params = Operations._query_params_to_size_and_sort(
query_params
)

script_fields = query_params.script_fields
query = Query(query_params.query)

body = query.to_search_body()
if script_fields is not None:
body["script_fields"] = script_fields

# Only return requested field_names and add them to body
_source = query_compiler.get_field_names(include_scripted_fields=False)
body["_source"] = _source if _source else False

if sort_params:
body["sort"] = [sort_params]

es_results: List[Dict[str, Any]] = sum(
_search_yield_hits(
query_compiler=query_compiler, body=body, max_number_of_hits=result_size
),
[],
)

df = query_compiler._es_results_to_pandas(
results=es_results, show_progress=show_progress
)
df = self._apply_df_post_processing(df, post_processing)
return df

def index_count(self, query_compiler: "QueryCompiler", field: str) -> int:
# field is the index field so count values
query_params, post_processing = self._resolve_tasks(query_compiler)
Expand Down
17 changes: 2 additions & 15 deletions eland/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
# under the License.

import copy
from datetime import datetime
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -33,11 +32,7 @@
import numpy as np
import pandas as pd # type: ignore

from eland.common import (
DEFAULT_PROGRESS_REPORTING_NUM_ROWS,
elasticsearch_date_to_pandas_date,
ensure_es_client,
)
from eland.common import elasticsearch_date_to_pandas_date, ensure_es_client
from eland.field_mappings import FieldMappings
from eland.filter import BooleanFilter, QueryFilter
from eland.index import Index
Expand Down Expand Up @@ -149,7 +144,6 @@ def es_dtypes(self) -> pd.Series:
def _es_results_to_pandas(
self,
results: List[Dict[str, Any]],
show_progress: bool = False,
) -> "pd.Dataframe":
"""
Parameters
Expand Down Expand Up @@ -274,10 +268,6 @@ def _es_results_to_pandas(
# flatten row to map correctly to 2D DataFrame
rows.append(self._flatten_dict(row, field_mapping_cache))

if show_progress:
if i % DEFAULT_PROGRESS_REPORTING_NUM_ROWS == 0:
print(f"{datetime.now()}: read {i} rows")

# Create pandas DataFrame
df = pd.DataFrame(data=rows, index=index)

Expand All @@ -299,9 +289,6 @@ def _es_results_to_pandas(
if len(self.columns) > 1:
df = df[self.columns]

if show_progress:
print(f"{datetime.now()}: read {i} rows")

return df

def _flatten_dict(self, y, field_mapping_cache: "FieldMappingCache"):
Expand Down Expand Up @@ -383,7 +370,7 @@ def _index_matches_count(self, items: List[Any]) -> int:
self, self.index.es_index_field, items
)

def _empty_pd_ef(self):
def _empty_pd_ef(self) -> "pd.DataFrame":
# Return an empty dataframe with correct columns and dtypes
df = pd.DataFrame()
for c, d in zip(self.dtypes.index, self.dtypes.values):
Expand Down