I tried this, but it fails.
from pyspark.sql import SparkSession
# Read a CSV file from S3 through the S3A connector and print its first rows.

# Build (or reuse) the Spark session with whole-stage codegen turned off.
spark = (
    SparkSession.builder
    .appName("periglue")
    .config('spark.sql.codegen.wholeStage', False)
    .getOrCreate()
)

# Fetch the session's Hadoop configuration once instead of on every call.
hadoop_conf = spark._jsc.hadoopConfiguration()

# NOTE(review): both credentials are empty strings here — presumably blanked
# out for sharing; real values must be supplied before the read can succeed.
hadoop_conf.set("fs.s3a.access.key", "")
hadoop_conf.set("fs.s3a.secret.key", "")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# NOTE(review): enableV4 is an AWS SDK *JVM system property*; setting it as a
# Hadoop configuration key likely has no effect. If V4 signing is required,
# pass -Dcom.amazonaws.services.s3.enableV4=true via the driver/executor
# extraJavaOptions instead — TODO confirm whether it is needed at all.
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider",
)
# The endpoint must be an S3 service hostname. The previous value
# "us-east-1.amazonaws.com" lacked the "s3." service prefix and therefore did
# not address S3 at all.
hadoop_conf.set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")

csvDf = spark.read.csv("s3a://periglue/banking.csv")
csvDf.show()
Error message:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/07 14:36:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Traceback (most recent call last):
File "C:/Users/periy/PycharmProjects/pythonProject/read-s3-csv.py", line 23, in <module>
csvDf = spark.read.csv("s3a://periglue/banking.csv")
File "C:\spark\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\readwriter.py", line 535, in csv
return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
File "C:\spark\spark-3.0.1-bin-hadoop2.7\python\lib\py4j-0.10.9-src.zip\py4j\java_gateway.py", line 1304, in call
File "C:\spark\spark-3.0.1-bin-hadoop2.7\python\pyspark\sql\utils.py", line 128, in deco
return f(*a, **kw)