|
18 | 18 | import copy |
19 | 19 | import warnings |
20 | 20 | from collections import defaultdict |
| 21 | +from datetime import datetime |
21 | 22 | from typing import ( |
22 | 23 | TYPE_CHECKING, |
23 | 24 | Any, |
|
39 | 40 | from eland.common import ( |
40 | 41 | DEFAULT_PAGINATION_SIZE, |
41 | 42 | DEFAULT_PIT_KEEP_ALIVE, |
| 43 | + DEFAULT_PROGRESS_REPORTING_NUM_ROWS, |
42 | 44 | DEFAULT_SEARCH_SIZE, |
43 | 45 | SortOrder, |
44 | 46 | build_pd_series, |
@@ -1198,18 +1200,34 @@ def describe(self, query_compiler: "QueryCompiler") -> pd.DataFrame: |
1198 | 1200 | def to_pandas( |
1199 | 1201 | self, query_compiler: "QueryCompiler", show_progress: bool = False |
1200 | 1202 | ) -> pd.DataFrame: |
1201 | | - df = self._es_results(query_compiler, show_progress) |
1202 | 1203 |
|
1203 | | - return df |
| 1204 | + df_list: List[pd.DataFrame] = [] |
| 1205 | + i = 0 |
| 1206 | + for df in self.search_yield_pandas_dataframes(query_compiler=query_compiler): |
| 1207 | + if show_progress: |
| 1208 | + i = i + df.shape[0] |
| 1209 | + if i % DEFAULT_PROGRESS_REPORTING_NUM_ROWS == 0: |
| 1210 | + print(f"{datetime.now()}: read {i} rows") |
| 1211 | + df_list.append(df) |
| 1212 | + |
| 1213 | + if show_progress: |
| 1214 | + print(f"{datetime.now()}: read {i} rows") |
| 1215 | + |
 | 1216 | +        # pd.concat() can't handle an empty list |
 | 1217 | +        # because there are no defined columns. |
| 1218 | + if not df_list: |
| 1219 | + return query_compiler._empty_pd_ef() |
| 1220 | + return pd.concat(df_list) |
1204 | 1221 |
|
1205 | 1222 | def to_csv( |
1206 | 1223 | self, |
1207 | 1224 | query_compiler: "QueryCompiler", |
1208 | 1225 | show_progress: bool = False, |
1209 | 1226 | **kwargs: Union[bool, str], |
1210 | 1227 | ) -> Optional[str]: |
1211 | | - df = self._es_results(query_compiler, show_progress) |
1212 | | - return df.to_csv(**kwargs) # type: ignore[no-any-return] |
| 1228 | + return self.to_pandas( # type: ignore[no-any-return] |
| 1229 | + query_compiler=query_compiler, show_progress=show_progress |
| 1230 | + ).to_csv(**kwargs) |
1213 | 1231 |
|
1214 | 1232 | def search_yield_pandas_dataframes( |
1215 | 1233 | self, query_compiler: "QueryCompiler" |
@@ -1241,42 +1259,6 @@ def search_yield_pandas_dataframes( |
1241 | 1259 | df = self._apply_df_post_processing(df, post_processing) |
1242 | 1260 | yield df |
1243 | 1261 |
|
1244 | | - def _es_results( |
1245 | | - self, query_compiler: "QueryCompiler", show_progress: bool = False |
1246 | | - ) -> pd.DataFrame: |
1247 | | - query_params, post_processing = self._resolve_tasks(query_compiler) |
1248 | | - |
1249 | | - result_size, sort_params = Operations._query_params_to_size_and_sort( |
1250 | | - query_params |
1251 | | - ) |
1252 | | - |
1253 | | - script_fields = query_params.script_fields |
1254 | | - query = Query(query_params.query) |
1255 | | - |
1256 | | - body = query.to_search_body() |
1257 | | - if script_fields is not None: |
1258 | | - body["script_fields"] = script_fields |
1259 | | - |
1260 | | - # Only return requested field_names and add them to body |
1261 | | - _source = query_compiler.get_field_names(include_scripted_fields=False) |
1262 | | - body["_source"] = _source if _source else False |
1263 | | - |
1264 | | - if sort_params: |
1265 | | - body["sort"] = [sort_params] |
1266 | | - |
1267 | | - es_results: List[Dict[str, Any]] = sum( |
1268 | | - _search_yield_hits( |
1269 | | - query_compiler=query_compiler, body=body, max_number_of_hits=result_size |
1270 | | - ), |
1271 | | - [], |
1272 | | - ) |
1273 | | - |
1274 | | - df = query_compiler._es_results_to_pandas( |
1275 | | - results=es_results, show_progress=show_progress |
1276 | | - ) |
1277 | | - df = self._apply_df_post_processing(df, post_processing) |
1278 | | - return df |
1279 | | - |
1280 | 1262 | def index_count(self, query_compiler: "QueryCompiler", field: str) -> int: |
1281 | 1263 | # field is the index field so count values |
1282 | 1264 | query_params, post_processing = self._resolve_tasks(query_compiler) |
|
0 commit comments