
I have a set of large tables stored as parquet files on S3. In Python, I'm using:

pd.read_parquet(...,columns=columns)

I'm reading the files directly from S3, without any database engine in between for preprocessing.

My question is: will the columns argument let me reduce data transfer to my remote Dask cluster workers by specifying just the subset of columns I'm interested in, or will the workers load the full parquet file first and then extract the columns? I suspect the latter.

Looking for a solution, I found S3 Select: https://docs.aws.amazon.com/AmazonS3/latest/userguide/selecting-content-from-objects.html

I think I could use boto3 and SQL syntax to read subsets of columns directly on S3, similar to what is done here: https://www.msp360.com/resources/blog/how-to-use-s3-select-feature-amazon/
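If I understand the API correctly, a single-object query would look roughly like this with boto3 (bucket, key, and column names are made up), although S3 Select works on one object at a time and returns CSV/JSON rows rather than parquet:

import boto3

s3 = boto3.client('s3')

# Hypothetical bucket/key/column; S3 Select queries a single object at a time
resp = s3.select_object_content(
    Bucket='my-bucket',
    Key='tables/my_table/part-0000.parquet',
    ExpressionType='SQL',
    Expression="SELECT s.col1 FROM s3object s",
    InputSerialization={'Parquet': {}},
    OutputSerialization={'CSV': {}},
)

# The payload is an event stream; 'Records' events carry the raw CSV bytes
data = b''.join(
    event['Records']['Payload'] for event in resp['Payload'] if 'Records' in event
)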

But what I would really like is a version of pd.read_parquet that does this in the background. What I found is the awswrangler library: https://aws-sdk-pandas.readthedocs.io/en/stable/stubs/awswrangler.s3.read_parquet.html

I suspect that awswrangler does exactly what I want, but I did not find an example that shows this. Does anybody know how it works?
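From the read_parquet signature in those docs, I imagine the call would be something like this (path and column names are made up), hoping that only the requested columns get downloaded:

import awswrangler as wr

df = wr.s3.read_parquet(
    path='s3://my-bucket/tables/my_table/',
    columns=['col1', 'col2'],
    dataset=True,  # treat the prefix as a (possibly partitioned) dataset
)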

Thanks!

1 Answer


Here's what I do, using a custom-defined read_parquets function. It seems to reduce the data transfer size.

df1 = read_parquets(s3_path)

Time taken 4m 8.0s

df2 = pd.read_parquet(s3_path, columns = ['col1'])

Time taken 25.0s

df3 = read_parquets(s3_path, columns = ['col1'])

Time taken 15.2s

Supporting functions:

from functools import lru_cache
import os
import re

import boto3
from botocore.errorfactory import ClientError
import json

import logging
_logs_file_ops = logging.getLogger(__name__)

s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')

def list_distribute_into_blocks(input_list: list, num_lists: int = 0) -> list[list]:
    """Equally distributes an input list into the specified number of sublists

    Args:
        input_list (list): A flat Python list
        num_lists (int): Number of sublists to distribute the input into
    """
    if num_lists == 0:
        return input_list
    try:
        list_length = len(input_list)
        sublist_size = list_length // num_lists  # Base size of each sublist
        remaining_elements = list_length % num_lists  # Elements left over after even division

        sublists = []
        index = 0

        for i in range(num_lists):
            sublist = input_list[index: index + sublist_size]
            index += sublist_size
            # If there are remaining elements, distribute them across sublists
            if remaining_elements > 0:
                sublist.append(input_list[index])
                index += 1
                remaining_elements -= 1

            sublists.append(sublist)

        sublists = [_ for _ in sublists if len(_) > 0]
        return sublists
    except Exception as e:
        _logs_file_ops.error(f'Distribution into {num_lists} lists failed due to: {e}')
        return input_list
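For example, splitting seven file paths across three threads puts the remainder into the first sublist:

list_distribute_into_blocks(['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7'], num_lists=3)
# -> [['f1', 'f2', 'f3'], ['f4', 'f5'], ['f6', 'f7']]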

def listdir(folder_path):
    """Returns a list of full paths to the contents of a specified location.
    Also accepts s3 locations
    """
    if 's3://' in folder_path:
        folder_path = folder_path.replace('\\', '/')
        # Split the s3 path into bucket and prefix (folder path);
        # _bucket_prefix is a small helper not shown here
        bucket_name, prefix = _bucket_prefix(folder_path)
        if prefix[-1] != '/':
            prefix += '/'

        keys = []
        kwargs = dict(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
        while True:
            resp = s3.list_objects_v2(**kwargs)
            files, folders = resp.get('Contents', []), resp.get('CommonPrefixes', [])
            for obj in files:
                keys.append(obj['Key'])
            for obj in folders:
                keys.append(obj['Prefix'])
            try:
                kwargs['ContinuationToken'] = resp['NextContinuationToken']
            except KeyError:
                break

        _logs_file_ops.info(f'{len(files)} files + {len(folders)} folders in {bucket_name}/{prefix}')
        return [f's3://{bucket_name}/{_}' for _ in keys]
    else:
        return [os.path.join(folder_path, _) for _ in os.listdir(folder_path)]
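Usage is the same for s3 prefixes and local folders; for an s3 prefix (bucket and prefix below are made up) it returns fully qualified s3:// paths:

files = listdir('s3://my-bucket/tables/my_table/')
# e.g. ['s3://my-bucket/tables/my_table/part-0000.parquet', ...]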

Function to read parquets in parallel. You can pass an S3 or local disk directory from which to read the parquet files:

def read_parquets(
        path: str | list[str], columns: list[str] | None = None,
        threads: int = 10, max_retries: int = 5, replace_npnan: bool = False
    ):
    """Reads the specified path (s3 or local) as a pandas dataframe

    Args:
        path (str | list[str]): A folder or a list of parquet files
        columns (list[str] | None, optional): Specify columns to be read from the parquet file
        threads (int, optional): Number of parallel threads reading. Do NOT exceed 10
        max_retries (int, optional): Max times to retry
        replace_npnan (bool, optional): Value to replace np.nan with, or False to leave nans as-is

    Returns:
        pandas.DataFrame: A pandas dataframe with all the parquet files concatenated
    """
    import pandas as pd
    import time
    import random
    import numpy as np
    from tqdm import tqdm
    from concurrent.futures import ThreadPoolExecutor
    import pyarrow.parquet as pq
    # list_distribute_into_blocks and listdir are the helpers defined above
    def _read_parquet_file(file_path, columns=None, retry_number=0):
        """Read a single parquet file
        """
        if retry_number >= max_retries:
            _logs_file_ops.error(f'Failed for {file_path} after {retry_number} retries')
            return None
        try:
            parquet_file = pq.ParquetFile(file_path)
            if parquet_file.metadata.num_rows == 0:
                return None
            if columns is not None:
                # Only request columns that actually exist in this file's schema
                columns = [_ for _ in columns if _ in parquet_file.schema.names]
            return parquet_file.read(columns=columns).to_pandas()
        except Exception as e:
            _logs_file_ops.warning(f'Retrying: {file_path}: {e}')
            # Full-jitter exponential backoff before retrying
            temp = min(10, 2**retry_number)
            wait_time = temp/2 + random.uniform(0, temp)
            time.sleep(wait_time)
            # Retry
            return _read_parquet_file(file_path, columns, retry_number + 1)

    def _read_list_of_parquet_files(file_list: list[str], columns=None):
        """Read a list of parquet files serially using `_read_parquet_file`
        """
        df = pd.concat([
                _read_parquet_file(file_path=f, columns=columns)
                for f in tqdm(file_list, desc='Reading...', leave=False)
            ])
        if replace_npnan is False:
            return df
        else:
            _logs_file_ops.warning(f'[!] Replacing np.nan with {replace_npnan}')
            return df.replace(np.nan, replace_npnan)

    if isinstance(path, str):
        if '.parquet' in path:
            # A single parquet file: read it directly
            return pd.read_parquet(path=path, columns=columns)
        else:
            list_of_files = [_ for _ in listdir(path) if '.parquet' in _]
    elif isinstance(path, list):
        list_of_files = path

    if len(list_of_files) == 0:
        _logs_file_ops.warning(f'No .parquet files found in {path}')
        return None

    # Split the file list into one chunk per thread and read the chunks in parallel
    file_list_of_lists = list_distribute_into_blocks(list_of_files, threads)
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [
            executor.submit(_read_list_of_parquet_files, file_list, columns)
            for file_list in file_list_of_lists
        ]
        sublists = [future.result() for future in futures]

    return pd.concat(sublists)
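Passing a folder works as in the timings above; you can also pass an explicit list of parquet files (the paths below are just placeholders):

df = read_parquets(
    ['s3://my-bucket/tables/my_table/part-0000.parquet',
     's3://my-bucket/tables/my_table/part-0001.parquet'],
    columns=['col1'],
)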