I am trying to load a CSV file with only 237 rows into DynamoDB, using the Python code below.
I am also trying to generate a CSV file with the "bad" rows for analysis. One column, called "createDate", must be inserted as NUMBER.
The problem is that when I convert the column to string in the data frame, 233 records get inserted, but the 237 - 233 = 4 missing rows do not appear in my log file.
When I try to insert it as Int, only 125 rows are inserted and the log file contains only 5 rows.
What am I missing here? Why is it inserting fewer rows, and when rows are not inserted, why are they not in my log file?
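For context, my understanding is that boto3 stores DynamoDB NUMBER values as Python Decimal (or plain int), never float, so the kind of conversion I think is expected for createDate looks roughly like the sketch below. This is only an illustration, not code I have verified end to end, and the helper name to_dynamo_number is just something I made up:

from decimal import Decimal
import pandas as pd

def to_dynamo_number(value):
    # DynamoDB NUMBER attributes accept int or Decimal, but reject Python float.
    if pd.isna(value):
        return None  # treat a missing createDate as NULL rather than NaN
    return Decimal(str(int(value)))

# e.g. an epoch timestamp that pandas read back as float:
print(to_dynamo_number(1621267106.0))  # -> Decimal('1621267106')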
#import required libraries..
import json
import boto3
import csv
import sys
import pandas as pd
import ast
import botocore
from botocore.config import Config
import asyncio
from botocore.exceptions import ClientError
# Saubhik: This loads a CSV file from local disk into AWS DynamoDB.
#Access Credentials
###################
#Change the access key as per your need.
a_key = "AAAAAAAAAA"
# Change the secret key as per your need.
a_S_key = "mmxxxxxxx+maskedHAM"
# Change the region as required
region= 'ap-south-1'
# The dataset contains lots of duplicates, so overwrite_keys is provided. You cannot perform multiple
# operations on the same item in the same BatchWriteItem request.
overwrite_keys = ["mpmID"]
#CSV file name, change this as per your need
filename="C:\Saubhik\Project\DynamoDB\CSV\mvw_at04162023.csv"
# Error file name.
errorfile = "C:\Saubhik\Project\DynamoDB\CSV\mycsv_error.csv"
# Connecting to DynamoDB
dynamodb = boto3.resource(
    "dynamodb",
    aws_access_key_id=a_key,
    aws_secret_access_key=a_S_key,
    region_name=region,
    config=Config(
        retries={"max_attempts": 5, "mode": "adaptive"},
        max_pool_connections=1024,
    ),
)
# DynamoDB table name, change it as per need.
try:
    table = dynamodb.Table("mvwcsv8jun")
except Exception as error:
    print("Error loading DynamoDB table. Check that the table was created correctly and the environment variables are set.")
    print(error)
# CSV reading using a pandas data frame - SB.
Customers = pd.read_csv(filename, dtype={'createDate': 'Int32',
                                         'cognitoSub': str,
                                         'customerIdHash': str,
                                         'username': str,
                                         'mpmID': str}
                        )
# Reconvert everything to string.
# If I uncomment this line, DynamoDB inserts 233 rows (almost all).
# If I leave it commented out, DynamoDB inserts only 125 rows, but the log file has only 4 rows.
# The error is "Float types are not supported. Use Decimal types instead."
#Customers = Customers.astype(str)
# Trying to write in batches to get faster inserts. overwrite_by_pkeys=overwrite_keys
try:
    with table.batch_writer(overwrite_by_pkeys=overwrite_keys) as batch:
        for i, record in enumerate(Customers.to_dict("records")):
            # add to dynamodb
            try:
                batch.put_item(
                    Item=record
                )
                """
                if len(batch['UnprocessedItems']) == 0:
                    print('Wrote 25 items to dynamo')
                else:
                    #await asyncio.sleep(5)
                    print('Hit write limit, backing off then retrying')
                    print(batch['UnprocessedItems'])
                """
            except Exception as error:
                print(error)
                print(record)
                print("End of Data Load.. starting new batch automatically..")
                # Writing an error file.
                try:
                    with open(errorfile, 'a') as f:
                        f.write(str(record["mpmID"]) + "," + str(record["cognitoSub"]) + "," + str(record["customerIdHash"])
                                + "," + str(record["username"]) + "," + str(record["createDate"]) + "," + str(error) + '\n')
                except Exception as error:
                    print(error)
except Exception as error:
    print(error)
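Not part of the posted code, but this is the clean-up step I am experimenting with before handing each record to batch_writer. It is only a sketch (the helper name clean_record is mine) and it only targets the missing-value / float problem from the error above:

from decimal import Decimal
import pandas as pd

def clean_record(record):
    # Replace values the DynamoDB resource layer rejects:
    # missing values (NaN / pd.NA) cannot be stored, and float must become Decimal.
    cleaned = {}
    for key, value in record.items():
        if pd.isna(value):
            cleaned[key] = None
        elif isinstance(value, float):
            cleaned[key] = Decimal(str(value))
        else:
            cleaned[key] = value
    return cleaned

# then, inside the loop above:
# batch.put_item(Item=clean_record(record))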
For the 125-row (Int) run, this is the log:
Float types are not supported. Use Decimal types instead.
{'cognitoSub': 'b61016f2-2172-4125-811d-b5b63d501386', 'customerIdHash': 'c553dc3283c858d600428ab9da98653a77b7cd25b18d42f7e342f50e3dd2811d', 'username': 'baba123', 'createDate': 1621267106, 'mpmID': 'a0U7i0000065Z8xEAE'}
End of Data Load.. starting new batch automatically..
Float types are not supported. Use Decimal types instead.
{'cognitoSub': '3df39da3-0b81-4829-a7bc-963229d80689', 'customerIdHash': '5ffc0526350ea7b88d8ecfb180401b69a9a62560f10d8a0a0ac8190f58526c3a', 'username': 'albertoperez11593478', 'createDate': 1678917593, 'mpmID': 'a0U7i0000064pelEAA'}
End of Data Load.. starting new batch automatically..
Float types are not supported. Use Decimal types instead.
{'cognitoSub': '145e8cbc-0641-4b1b-b348-5fce4737ffab', 'customerIdHash': '90aa5ce118ee22ba9886562b3610925a6d9756ca5cd5cef44c56d348ec222d36', 'username': 'jones_jeff1', 'createDate': 1584473346, 'mpmID': 'a0U7i00000661PnEAI'}
End of Data Load.. starting new batch automatically..
Float types are not supported. Use Decimal types instead.
{'cognitoSub': '8cb9eb6d-bd4b-45eb-8e8d-9b9a585beea3', 'customerIdHash': '44719f84a7bcab4b8e35365e1d5903269e287a19279fc669bcb6a67b4c29ea97', 'username': 'shaban1', 'createDate': 1622571432, 'mpmID': 'a0U7i0000065YGHEA2'}
End of Data Load.. starting new batch automatically..
Float types are not supported. Use Decimal types instead.
For the 233-row run (everything converted to string) there is no such log output at all.
Thanks for your answer. I can convert that column to an integer when reading into the data frame: Customers = pd.read_csv(filename, dtype={'createDate': 'Int32', 'cognitoSub': str, 'customerIdHash': str, 'username': str, 'mpmID': str}). The problem is that it then inserts only 145 records out of 237, and the log file contains only 5 rows with that error, so I am not sure what is happening with the remaining 87-odd rows. Later, if I use Customers = Customers.astype(str), it inserts 233 rows but no rows appear in my log file, so I am not sure what happens to the remaining 4 rows.
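To work out where the missing rows go, I am also thinking of counting attempts and duplicate keys explicitly, since the dataset has lots of duplicate mpmID values and batch_writer with overwrite_by_pkeys overwrites those rather than adding new items. A rough sketch only, reusing the same Customers frame (the counter names are mine):

attempted = 0
unique_keys = set()

for record in Customers.to_dict("records"):
    attempted += 1
    unique_keys.add(record["mpmID"])  # duplicates collapse into a single DynamoDB item

duplicates = attempted - len(unique_keys)
print(f"attempted={attempted}, unique mpmID={len(unique_keys)}, duplicates={duplicates}")
# attempted - duplicates should be close to the item count DynamoDB ends up with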