I have a large (>10gb) txt
file on S3 that I trying to load into pandas. I have tried a couple different approaches, but haven't been able to successfully load / read the data.
Python 3.8.12
As for the memory, I am running this on an instance with 32 Gib of RAM, so I don't think it's a memory issue.
import pandas as pd
import boto3
# AWS credentials
import boto3
aws_id = 'xxxx'
aws_secret = 'xxxx'
Client = boto3.client(
's3',
aws_access_key_id=aws_id,
aws_secret_access_key=aws_secret
)
# Read data from S3
result = Client.get_object(Bucket="bucket-1", Key = "file1.txt")
print(result)
# To Pandas DataFrame
df_loan = pd.read_csv(io.BytesIO(result['Body'].read()), sep="\t", dtype='object', header=None)
Traceback:
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
/tmp/ipykernel_4239/3915305936.py in <cell line: 2>()
1 # To Pandas DataFrame
----> 2 df_loan = pd.read_csv(io.BytesIO(result['Body'].read()), sep="\t", dtype='object', header=None)
~/anaconda3/envs/python3/lib/python3.8/site-packages/botocore/response.py in read(self, amt)
97 """
98 try:
---> 99 chunk = self._raw_stream.read(amt)
100 except URLLib3ReadTimeoutError as e:
101 # TODO: the url will be None as urllib3 isn't setting it yet
~/anaconda3/envs/python3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
513 if amt is None:
514 # cStringIO doesn't like amt=None
--> 515 data = self._fp.read() if not fp_closed else b""
516 flush_decoder = True
517 else:
~/anaconda3/envs/python3/lib/python3.8/http/client.py in read(self, amt)
470 else:
471 try:
--> 472 s = self._safe_read(self.length)
473 except IncompleteRead:
474 self._close_conn()
~/anaconda3/envs/python3/lib/python3.8/http/client.py in _safe_read(self, amt)
611 IncompleteRead exception can be used to detect the problem.
612 """
--> 613 data = self.fp.read(amt)
614 if len(data) < amt:
615 raise IncompleteRead(data, amt-len(data))
~/anaconda3/envs/python3/lib/python3.8/socket.py in readinto(self, b)
667 while True:
668 try:
--> 669 return self._sock.recv_into(b)
670 except timeout:
671 self._timeout_occurred = True
~/anaconda3/envs/python3/lib/python3.8/ssl.py in recv_into(self, buffer, nbytes, flags)
1239 "non-zero flags not allowed in calls to recv_into() on %s" %
1240 self.__class__)
-> 1241 return self.read(nbytes, buffer)
1242 else:
1243 return super().recv_into(buffer, nbytes, flags)
~/anaconda3/envs/python3/lib/python3.8/ssl.py in read(self, len, buffer)
1097 try:
1098 if buffer is not None:
-> 1099 return self._sslobj.read(len, buffer)
1100 else:
1101 return self._sslobj.read(len)
OverflowError: signed integer is greater than maximum