You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/04/27 06:22:24 UTC
[GitHub] [arrow] JiaRu2016 commented on issue #10138: feather read a part of columns slower than read the entire file
JiaRu2016 commented on issue #10138:
URL: https://github.com/apache/arrow/issues/10138#issuecomment-827345634
Here is a reproducing script and its output. The output indicates that reading 50% of the columns or more does not save any time, so the measured speed (GB/s) decreases as the number of columns to read decreases.
In theory, what will affect reading speed? The only factor I could think of is contiguity of columns, i.e. maybe reading columns [1 2 3 4 ... 10] is faster than reading 10 randomly picked columns like [42, 24, 15, 9 ...]? Is there any documentation on the on-disk storage of the feather format?
reproducing script:
```python
import numpy as np
import pandas as pd
import os
import time
import tqdm
# Benchmark dimensions: n rows x m float64 columns per file.
n, m = 10_0000, 135
# NOTE: 10_0000 * 135 * 8 bytes ~= 103 MB per file (the old comment claimed
# 100_0000 rows / 1.005 GB, which contradicts both n above and the measured
# "bytes per file = 102.99... MB" in the output below).
dtype = np.float64
nbytes_per_file = n * m * np.finfo(dtype).bits / 8
nfiles = 10
#output_dir = '/dev/shm/io_speed_test_feather_cols/' # /dev/shm
output_dir = '/auto/rjia/io_speed_test_feather_cols/' # local disk
#output_dir = '/cpfs/user/rjia/io_speed_test_feather_cols/' # network file system
def setup():
    """Wipe any previous output directory, recreate it, and print run info."""
    import shutil
    already_there = os.path.exists(output_dir)
    if already_there:
        shutil.rmtree(output_dir)
        print(f'rm existing dir: {output_dir}')
    os.makedirs(output_dir)
    # os.uname().nodename is the same field as os.uname()[1].
    print(f'hostname: {os.uname().nodename}')
    print(f'# files = {nfiles}, bytes per file = {nbytes_per_file / 1024**2} MB')
def generate_files():
    """Write `nfiles` feather files filled with fresh random float data."""
    for file_idx in tqdm.tqdm(range(nfiles), desc='generate_files'):
        data = np.random.normal(size=(n, m)).astype(dtype)
        frame = pd.DataFrame(data, columns=[f'x{col}' for col in range(m)])
        frame.to_feather(f'{output_dir}/{file_idx}.feather')
def summary(tag, seconds_list, nbytes_per_file):
    """Print mean/std read throughput (GB/s) for a list of per-file timings."""
    gb = nbytes_per_file / 1024**3
    speed_GBs = [gb / secs for secs in seconds_list]
    mean = np.mean(speed_GBs)
    std = np.std(speed_GBs)
    n = len(seconds_list)
    print(f'[ {tag} ] speed (GB/s): mean = {mean}, std = {std}, n = {n}; total time_elapse: {sum(seconds_list)}')
def main():
    """Time feather reads at decreasing column fractions and report speeds."""
    for cols in [1.0, 0.8, 0.6, 0.4, 0.2]:
        # Regenerate the files on every pass to avoid hitting cached data.
        generate_files()
        # None means "read every column"; otherwise sample a random subset.
        columns = None if cols == 1.0 else sorted(
            np.random.choice(m, size=int(cols*m), replace=False).tolist())
        elapse_lst = []
        for file_idx in tqdm.tqdm(range(nfiles), desc=f'read_{cols}'):
            t0 = time.perf_counter()
            _df = pd.read_feather(f'{output_dir}/{file_idx}.feather', columns=columns)
            t1 = time.perf_counter()
            elapse_lst.append(t1 - t0)
        # Scale the per-file byte count by the fraction of columns read.
        summary(f'read_{cols}', elapse_lst, nbytes_per_file * cols)
if __name__ == "__main__":
setup()
main()
```
output (progress bars filtered out)
```
# # files = 10, bytes per file = 102.996826171875 MB
# [ read_1.0 ] speed (GB/s): mean = 1.0219851767027053, std = 0.03415322016211257, n = 10; total time_elapse: 0.9852561568841338
# [ read_0.8 ] speed (GB/s): mean = 0.7865783976845142, std = 0.07702083870020748, n = 10; total time_elapse: 1.0330937625840306
# [ read_0.6 ] speed (GB/s): mean = 0.5654617667430214, std = 0.03666601160766624, n = 10; total time_elapse: 1.071873253211379
# [ read_0.4 ] speed (GB/s): mean = 0.46550180257149, std = 0.05855073295937812, n = 10; total time_elapse: 0.8783506583422422
# [ read_0.2 ] speed (GB/s): mean = 0.27125950048361597, std = 0.023256912005982042, n = 10; total time_elapse: 0.7473218226805329
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org