@@ -1,8 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC
-import threading
-from typing import Dict, List, Optional, Tuple, Union, TYPE_CHECKING
+from typing import List, Optional, Tuple, Union, TYPE_CHECKING
 
 from databricks.sql.cloudfetch.download_manager import ResultFileDownloadManager
 from databricks.sql.telemetry.models.enums import StatementType
@@ -123,179 +122,6 @@ def close(self):
         return
 
 
-class LinkFetcher:
-    """
-    Background helper that incrementally retrieves *external links* for a
-    result set produced by the SEA backend and feeds them to a
-    :class:`databricks.sql.cloudfetch.download_manager.ResultFileDownloadManager`.
-
-    The SEA backend splits large result sets into *chunks*. Each chunk is
-    stored remotely (e.g., in object storage) and exposed via a signed URL
-    encapsulated by an :class:`ExternalLink`. Only the first batch of links is
-    returned with the initial query response. The remaining links must be
-    pulled on demand using the *next-chunk* token embedded in each
-    :pyattr:`ExternalLink.next_chunk_index`.
-
-    LinkFetcher takes care of this choreography so callers (primarily
-    ``SeaCloudFetchQueue``) can simply ask for the link of a specific
-    ``chunk_index`` and block until it becomes available.
-
-    Key responsibilities:
-
-    • Maintain an in-memory mapping from ``chunk_index`` → ``ExternalLink``.
-    • Launch a background worker thread that continuously requests the next
-      batch of links from the backend until all chunks have been discovered or
-      an unrecoverable error occurs.
-    • Bridge SEA link objects to the Thrift representation expected by the
-      existing download manager.
-    • Provide a synchronous API (`get_chunk_link`) that blocks until the desired
-      link is present in the cache.
-    """
-
-    def __init__(
-        self,
-        download_manager: ResultFileDownloadManager,
-        backend: SeaDatabricksClient,
-        statement_id: str,
-        initial_links: List[ExternalLink],
-        total_chunk_count: int,
-    ):
-        self.download_manager = download_manager
-        self.backend = backend
-        self._statement_id = statement_id
-
-        self._shutdown_event = threading.Event()
-
-        self._link_data_update = threading.Condition()
-        self._error: Optional[Exception] = None
-        self.chunk_index_to_link: Dict[int, ExternalLink] = {}
-
-        self._add_links(initial_links)
-        self.total_chunk_count = total_chunk_count
-
-        # DEBUG: capture initial state for observability
-        logger.debug(
-            "LinkFetcher[%s]: initialized with %d initial link(s); expecting %d total chunk(s)",
-            statement_id,
-            len(initial_links),
-            total_chunk_count,
-        )
-
-    def _add_links(self, links: List[ExternalLink]):
-        """Cache *links* locally and enqueue them with the download manager."""
-        logger.debug(
-            "LinkFetcher[%s]: caching %d link(s) – chunks %s",
-            self._statement_id,
-            len(links),
-            ", ".join(str(l.chunk_index) for l in links) if links else "<none>",
-        )
-        for link in links:
-            self.chunk_index_to_link[link.chunk_index] = link
-            self.download_manager.add_link(LinkFetcher._convert_to_thrift_link(link))
-
-    def _get_next_chunk_index(self) -> Optional[int]:
-        """Return the next *chunk_index* that should be requested from the backend, or ``None`` if we have them all."""
-        with self._link_data_update:
-            max_chunk_index = max(self.chunk_index_to_link.keys(), default=None)
-            if max_chunk_index is None:
-                return 0
-            max_link = self.chunk_index_to_link[max_chunk_index]
-            return max_link.next_chunk_index
-
-    def _trigger_next_batch_download(self) -> bool:
-        """Fetch the next batch of links from the backend and return *True* on success."""
-        logger.debug(
-            "LinkFetcher[%s]: requesting next batch of links", self._statement_id
-        )
-        next_chunk_index = self._get_next_chunk_index()
-        if next_chunk_index is None:
-            return False
-
-        try:
-            links = self.backend.get_chunk_links(self._statement_id, next_chunk_index)
-            with self._link_data_update:
-                self._add_links(links)
-                self._link_data_update.notify_all()
-        except Exception as e:
-            logger.error(
-                f"LinkFetcher: Error fetching links for chunk {next_chunk_index}: {e}"
-            )
-            with self._link_data_update:
-                self._error = e
-                self._link_data_update.notify_all()
-            return False
-
-        logger.debug(
-            "LinkFetcher[%s]: received %d new link(s)",
-            self._statement_id,
-            len(links),
-        )
-        return True
-
-    def get_chunk_link(self, chunk_index: int) -> Optional[ExternalLink]:
-        """Return (blocking) the :class:`ExternalLink` associated with *chunk_index*."""
-        logger.debug(
-            "LinkFetcher[%s]: waiting for link of chunk %d",
-            self._statement_id,
-            chunk_index,
-        )
-        if chunk_index >= self.total_chunk_count:
-            return None
-
-        with self._link_data_update:
-            while chunk_index not in self.chunk_index_to_link:
-                if self._error:
-                    raise self._error
-                if self._shutdown_event.is_set():
-                    raise ProgrammingError(
-                        "LinkFetcher is shutting down without providing link for chunk index {}".format(
-                            chunk_index
-                        )
-                    )
-                self._link_data_update.wait()
-
-            return self.chunk_index_to_link[chunk_index]
-
-    @staticmethod
-    def _convert_to_thrift_link(link: ExternalLink) -> TSparkArrowResultLink:
-        """Convert SEA external links to Thrift format for compatibility with existing download manager."""
-        # Parse the ISO format expiration time
-        expiry_time = int(dateutil.parser.parse(link.expiration).timestamp())
-        return TSparkArrowResultLink(
-            fileLink=link.external_link,
-            expiryTime=expiry_time,
-            rowCount=link.row_count,
-            bytesNum=link.byte_count,
-            startRowOffset=link.row_offset,
-            httpHeaders=link.http_headers or {},
-        )
-
-    def _worker_loop(self):
-        """Entry point for the background thread."""
-        logger.debug("LinkFetcher[%s]: worker thread started", self._statement_id)
-        while not self._shutdown_event.is_set():
-            links_downloaded = self._trigger_next_batch_download()
-            if not links_downloaded:
-                self._shutdown_event.set()
-        logger.debug("LinkFetcher[%s]: worker thread exiting", self._statement_id)
-        self._link_data_update.notify_all()
-
-    def start(self):
-        """Spawn the worker thread."""
-        logger.debug("LinkFetcher[%s]: starting worker thread", self._statement_id)
-        self._worker_thread = threading.Thread(
-            target=self._worker_loop, name=f"LinkFetcher-{self._statement_id}"
-        )
-        self._worker_thread.start()
-
-    def stop(self):
-        """Signal the worker thread to stop and wait for its termination."""
-        logger.debug("LinkFetcher[%s]: stopping worker thread", self._statement_id)
-        self._shutdown_event.set()
-        self._worker_thread.join()
-        logger.debug("LinkFetcher[%s]: worker thread stopped", self._statement_id)
-
-
 class SeaCloudFetchQueue(CloudFetchQueue):
     """Queue implementation for EXTERNAL_LINKS disposition with ARROW format for SEA backend."""
 
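The deleted `LinkFetcher.get_chunk_link` above relies on a classic condition-variable protocol: a producer thread caches links and calls `notify_all`, while consumers `wait` until their chunk index appears or shutdown is signalled. A minimal, self-contained sketch of that pattern (toy types and names, not the driver's API):

```python
import threading
from typing import Dict, Optional


class BlockingCache:
    """Producer/consumer cache mirroring LinkFetcher's wait/notify protocol."""

    def __init__(self):
        self._cond = threading.Condition()
        self._data: Dict[int, str] = {}
        self._shutdown = threading.Event()

    def put(self, key: int, value: str) -> None:
        with self._cond:
            self._data[key] = value
            self._cond.notify_all()  # wake consumers blocked in get()

    def stop(self) -> None:
        self._shutdown.set()
        with self._cond:
            self._cond.notify_all()  # wake consumers so they observe shutdown

    def get(self, key: int) -> Optional[str]:
        with self._cond:
            while key not in self._data:
                if self._shutdown.is_set():
                    return None  # shutting down without the requested key
                self._cond.wait()
            return self._data[key]


cache = BlockingCache()
producer = threading.Thread(target=lambda: cache.put(0, "link-0"))
producer.start()
print(cache.get(0))  # blocks until the producer inserts "link-0"
producer.join()
cache.stop()
print(cache.get(99))  # None: shutdown was signalled and the key never arrived
```

The hunk below replaces this background machinery with synchronous, on-demand fetching.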
@@ -337,49 +163,80 @@ def __init__(
             chunk_id=0,
         )
 
+        self._sea_client = sea_client
+        self._statement_id = statement_id
+        self._total_chunk_count = total_chunk_count
+
         logger.debug(
             "SeaCloudFetchQueue: Initialize CloudFetch loader for statement {}, total chunks: {}".format(
                 statement_id, total_chunk_count
             )
         )
 
         initial_links = result_data.external_links or []
+        self._chunk_index_to_link = {link.chunk_index: link for link in initial_links}
 
         # Track the current chunk we're processing
         self._current_chunk_index = 0
+        first_link = self._chunk_index_to_link.get(self._current_chunk_index, None)
+        if not first_link:
+            # possibly an empty response
+            return None
 
-        self.link_fetcher = None  # for empty responses, we do not need a link fetcher
-        if total_chunk_count > 0:
-            self.link_fetcher = LinkFetcher(
-                download_manager=self.download_manager,
-                backend=sea_client,
-                statement_id=statement_id,
-                initial_links=initial_links,
-                total_chunk_count=total_chunk_count,
-            )
-            self.link_fetcher.start()
-
+        # Track the current chunk we're processing
+        self._current_chunk_index = 0
         # Initialize table and position
-        self.table = self._create_next_table()
+        self.table = self._create_table_from_link(first_link)
 
-    def _create_next_table(self) -> Union["pyarrow.Table", None]:
-        """Create next table by retrieving the logical next downloaded file."""
-        if self.link_fetcher is None:
-            return None
+    def _convert_to_thrift_link(self, link: ExternalLink) -> TSparkArrowResultLink:
+        """Convert SEA external links to Thrift format for compatibility with existing download manager."""
+        # Parse the ISO format expiration time
+        expiry_time = int(dateutil.parser.parse(link.expiration).timestamp())
+        return TSparkArrowResultLink(
+            fileLink=link.external_link,
+            expiryTime=expiry_time,
+            rowCount=link.row_count,
+            bytesNum=link.byte_count,
+            startRowOffset=link.row_offset,
+            httpHeaders=link.http_headers or {},
+        )
 
-        chunk_link = self.link_fetcher.get_chunk_link(self._current_chunk_index)
-        if chunk_link is None:
+    def _get_chunk_link(self, chunk_index: int) -> Optional["ExternalLink"]:
+        if chunk_index >= self._total_chunk_count:
             return None
 
-        row_offset = chunk_link.row_offset
-        # NOTE: link has already been submitted to download manager at this point
-        arrow_table = self._create_table_at_offset(row_offset)
+        if chunk_index not in self._chunk_index_to_link:
+            links = self._sea_client.get_chunk_links(self._statement_id, chunk_index)
+            self._chunk_index_to_link.update({l.chunk_index: l for l in links})
+
+        link = self._chunk_index_to_link.get(chunk_index, None)
+        if not link:
+            raise ServerOperationError(
+                f"Error fetching link for chunk {chunk_index}",
+                {
+                    "operation-id": self._statement_id,
+                    "diagnostic-info": None,
+                },
+            )
+        return link
 
-        self._current_chunk_index += 1
+    def _create_table_from_link(
+        self, link: ExternalLink
+    ) -> Union["pyarrow.Table", None]:
+        """Create a table from a link."""
+
+        thrift_link = self._convert_to_thrift_link(link)
+        self.download_manager.add_link(thrift_link)
+
+        row_offset = link.row_offset
+        arrow_table = self._create_table_at_offset(row_offset)
 
         return arrow_table
 
-    def close(self):
-        super().close()
-        if self.link_fetcher:
-            self.link_fetcher.stop()
+    def _create_next_table(self) -> Union["pyarrow.Table", None]:
+        """Create next table by retrieving the logical next downloaded file."""
+        self._current_chunk_index += 1
+        next_chunk_link = self._get_chunk_link(self._current_chunk_index)
+        if not next_chunk_link:
+            return None
+        return self._create_table_from_link(next_chunk_link)
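The replacement flow is synchronous: `_create_next_table` advances the chunk index, and `_get_chunk_link` asks the backend for links only on a cache miss. A runnable sketch of that lazy-cache pattern, with a hypothetical `FakeBackend` standing in for `SeaDatabricksClient` (only the `get_chunk_links` name mirrors the real client):

```python
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class ExternalLink:
    chunk_index: int
    external_link: str


class FakeBackend:
    """Illustrative stand-in: returns the batch of links starting at chunk_index."""

    def get_chunk_links(self, statement_id: str, chunk_index: int) -> List[ExternalLink]:
        return [ExternalLink(chunk_index, f"https://storage.example/chunk-{chunk_index}")]


class LazyLinkCache:
    """Mirrors the diff's _get_chunk_link: consult the cache, fetch on miss."""

    def __init__(self, backend: FakeBackend, statement_id: str, total_chunk_count: int):
        self._backend = backend
        self._statement_id = statement_id
        self._total_chunk_count = total_chunk_count
        self._chunk_index_to_link: Dict[int, ExternalLink] = {}

    def get(self, chunk_index: int) -> Optional[ExternalLink]:
        if chunk_index >= self._total_chunk_count:
            return None  # past the last chunk: iteration is complete
        if chunk_index not in self._chunk_index_to_link:
            # Cache miss: request the batch containing this chunk from the backend.
            links = self._backend.get_chunk_links(self._statement_id, chunk_index)
            self._chunk_index_to_link.update({l.chunk_index: l for l in links})
        return self._chunk_index_to_link.get(chunk_index)


cache = LazyLinkCache(FakeBackend(), "stmt-1", total_chunk_count=3)
for i in range(4):
    print(i, cache.get(i))  # chunk 3 prints None: past the end of the result set
```

The trade-off is that a fetch now happens on the consumer's thread when it reaches an uncached chunk, rather than being prefetched by a background worker.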
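Both versions share the `_convert_to_thrift_link` logic, which turns the SEA link's ISO-8601 `expiration` string into the epoch-seconds `expiryTime` that `TSparkArrowResultLink` carries. A standalone check of that conversion with `dateutil` (the timestamp value is illustrative):

```python
import dateutil.parser

# SEA returns expirations as ISO-8601 strings; Thrift links carry epoch seconds.
expiration = "2025-01-15T10:30:00Z"  # illustrative value
expiry_time = int(dateutil.parser.parse(expiration).timestamp())
print(expiry_time)  # 1736937000
```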