0

我正在使用 Pyspark 连接到 HIVE 并获取一些数据。问题是它返回所有具有列名值的行。它返回正确的列名。只有行值不正确。

这是我的代码

hive_jar_path="C:Users/shakir/Downloads/ClouderaHiveJDBC-2.6.11.1014/ClouderaHiveJDBC-2.6.11.1014/ClouderaHiveJDBC42-2.6.11.1014/HiveJDBC42.jar"
print(hive_jar_path)
print("")

import os
os.environ["HADOOP_HOME"]="c:/users/shakir/downloads/spark/spark/spark"
import os
os.environ["SPARK_HOME"]="c:/users/shakir/downloads/spark/spark/spark"
import findspark
findspark.init()



from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

import uuid
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", "hdfs://...../user/hive/warehouse/..../....")
    

spark.config("spark.driver.extraClassPath", hive_jar_path)
spark.config("spark.sql.hive.llap", "true")
spark.config("spark.sql.warehouse.dir", "hdfs://...../user/hive/warehouse/..../....")


spark=spark.enableHiveSupport().getOrCreate()

import databricks.koalas as ks


print("Reading Data from Hive . . .")
options={
    "fetchsize":1000,
    "inferSchema": True,
    "fileFormat":"orc",
    "inputFormat":"org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
    "outputFormat":"org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
    "driver":"org.apache.hive.jdbc.HiveDriver",
    }
df = ks.read_sql("SELECT * FROM PERSONS LIMIT 3", connection_string,options=options)
print("Done")
print(df)

代码输出:

+------+-----+---------+
| Name | Age | Address |
+------+-----+---------+
| Name | Age | Address |
+------+-----+---------+
| Name | Age | Address |
+------+-----+---------+
| Name | Age | Address |
+------+-----+---------+
4

0 回答 0