I am trying to connect Databricks (latest runtime, 7.1+ / Spark 3.0) to BigQuery, using PySpark as the scripting/base language. Steps taken so far:
- Passed the BigQuery API to Databricks for the connection
- Installed the latest spark-bigquery connector jar in Databricks

We then ran the PySpark script below to fetch data from a BigQuery table into Databricks:
from pyspark.sql import SparkSession

# Build the session with the BigQuery connector jar attached
# ('google-project-ID', 'jarlocation.jar', 'file path', etc. are placeholders).
spark = (
    SparkSession.builder
    .appName('bq')
    .master('local[4]')
    .config('parentProject', 'google-project-ID')
    .config('spark.jars', 'jarlocation.jar')
    .getOrCreate()
)

# Read from BigQuery; the trailing '*' is meant to match every table
# sharing the 'tablename' prefix.
df = (
    spark.read.format("bigquery")
    .option("credentialsFile", "file path")
    .option("parentProject", "google-project-ID")
    .option("project", "Dataset-Name")
    .option("table", "dataset.schema.tablename*")
    .load()
)
After running the script, we get the error below as soon as we try to view the data.
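Since the read is lazy, nothing fails until an action touches the DataFrame; in our case that is simply viewing it, e.g.:

df.show()  # or display(df) in the Databricks notebook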
Error:
com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.InvalidArgumentException: com.google.cloud.spark.bigquery.repackaged.io.grpc.StatusRuntimeException: INVALID_ARGUMENT: request failed: wildcard tables are not supported
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.ApiExceptionFactory.createException(ApiExceptionFactory.java:49)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:72)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:60)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.grpc.GrpcExceptionCallable$ExceptionTransformingFuture.onFailure(GrpcExceptionCallable.java:97)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.core.ApiFutures$1.onFailure(ApiFutures.java:68)
at com.google.cloud.spark.bigquery.repackaged.com.google.common.util.concurrent.Futures$CallbackListener.run(Futures.java:1039)
at com.google.cloud.spark.bigquery.repackaged.com.google.common.util.concurrent.DirectExecutor.execute(DirectExecutor.java:30)
at com.google.cloud.spark.bigquery.repackaged.com.google.common.util.concurrent.AbstractFuture.executeListener(AbstractFuture.java:1165)
at com.google.cloud.spark.bigquery.repackaged.com.google.common.util.concurrent.AbstractFuture.complete(AbstractFuture.java:958)
at com.google.cloud.spark.bigquery.repackaged.com.google.common.util.concurrent.AbstractFuture.setException(AbstractFuture.java:749)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.stub.ClientCalls$GrpcFuture.setException(ClientCalls.java:522)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.stub.ClientCalls$UnaryStreamToFuture.onClose(ClientCalls.java:497)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:426)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.ClientCallImpl.access$500(ClientCallImpl.java:66)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl.close(ClientCallImpl.java:689)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl.access$900(ClientCallImpl.java:577)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInternal(ClientCallImpl.java:751)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1StreamClosed.runInContext(ClientCallImpl.java:740)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at com.google.cloud.spark.bigquery.repackaged.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:123)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Suppressed: com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.AsyncTaskException: Asynchronous task failed
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.ApiExceptions.callAndTranslateApiException(ApiExceptions.java:57)
at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.rpc.UnaryCallable.call(UnaryCallable.java:112)
at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.storage.v1beta2.BigQueryReadClient.createReadSession(BigQueryReadClient.java:230)
at com.google.cloud.spark.bigquery.direct.DirectBigQueryRelation.buildScan(DirectBigQueryRelation.scala:135)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy.$anonfun$apply$2(DataSourceStrategy.scala:375)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy.pruneFilterProjectRaw(DataSourceStrategy.scala:472)
at org.apache.spark.sql.execution.datasources.DataSourceStrategy.apply(DataSourceStrategy.scala:374)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:67)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:97)
at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:74)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:82)
at scala.collection.TraversableOnce.$anonfun$foldLeft$1(TraversableOnce.scala:162)
at scala.collection.TraversableOnce.$anonfun$foldLeft$1$adapted(TraversableOnce.scala:162)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:162)
at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:160)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:79)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:97)
at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:74)
at org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:427)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:124)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:171)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:836)
at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:171)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:124)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:117)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:139)
at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:171)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:836)
at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:171)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:135)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:131)
at org.apache.spark.sql.execution.QueryExecution.$anonfun$writePlans$5(QueryExecution.scala:248)
at org.apache.spark.sql.catalyst.plans.QueryPlan$.append(QueryPlan.scala:466)
at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$writePlans(QueryExecution.scala:248)
at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:256)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:109)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:249)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:836)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:199)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3700)
at org.apache.spark.sql.Dataset.collectResult(Dataset.scala:2977)
at com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation0(OutputAggregator.scala:194)
at com.databricks.backend.daemon.driver.OutputAggregator$.withOutputAggregation(OutputAggregator.scala:57)
at com.databricks.backend.daemon.driver.PythonDriverLocal.generateTableResult(PythonDriverLocal.scala:1154)
at com.databricks.backend.daemon.driver.PythonDriverLocal.$anonfun$getResultBufferInternal$1(PythonDriverLocal.scala:1066)
at com.databricks.backend.daemon.driver.PythonDriverLocal.withInterpLock(PythonDriverLocal.scala:853)
at com.databricks.backend.daemon.driver.PythonDriverLocal.getResultBufferInternal(PythonDriverLocal.scala:935)
at com.databricks.backend.daemon.driver.DriverLocal.getResultBuffer(DriverLocal.scala:538)
at com.databricks.backend.daemon.driver.PythonDriverLocal.outputSuccess(PythonDriverLocal.scala:895)
at com.databricks.backend.daemon.driver.PythonDriverLocal.$anonfun$repl$8(PythonDriverLocal.scala:380)
at com.databricks.backend.daemon.driver.PythonDriverLocal.withInterpLock(PythonDriverLocal.scala:853)
at com.databricks.backend.daemon.driver.PythonDriverLocal.repl(PythonDriverLocal.scala:367)
at com.databricks.backend.daemon.driver.DriverLocal.$anonfun$execute$10(DriverLocal.scala:431)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:239)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:234)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:231)
at com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:48)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:276)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:269)
at com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:48)
at com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:408)
at com.databricks.backend.daemon.driver.DriverWrapper.$anonfun$tryExecutingCommand$1(DriverWrapper.scala:653)
at scala.util.Try$.apply(Try.scala:213)
at com.databricks.backend.daemon.driver.DriverWrapper.tryExecutingCommand(DriverWrapper.scala:645)
at com.databricks.backend.daemon.driver.DriverWrapper.getCommandOutputAndError(DriverWrapper.scala:486)
at com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:598)
at com.databricks.backend.daemon.driver.DriverWrapper.runInnerLoop(DriverWrapper.scala:391)
at com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:337)
at com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:219)
... 1 more
Caused by: com.google.cloud.spark.bigquery.repackaged.io.grpc.StatusRuntimeException: INVALID_ARGUMENT: request failed: wildcard tables are not supported
at com.google.cloud.spark.bigquery.repackaged.io.grpc.Status.asRuntimeException(Status.java:533)
... 16 more
However, if we fetch the data without the "*" in the table name, we are able to retrieve the data (in nested format) with no issue. Judging from the stack trace, the failure comes from BigQueryReadClient.createReadSession, i.e. the BigQuery Storage Read API rejecting the wildcard. Is there a way to fetch all the tables under a single schema in a single script?

Any help would be appreciated. Thanks in advance!
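The only workaround we have come up with so far is to enumerate the tables ourselves with the google-cloud-bigquery client library and union the per-table reads. A rough sketch, assuming that library is installed on the cluster, that all matched tables share a compatible schema, and using the same placeholder names as above:

from functools import reduce
from google.cloud import bigquery

# Enumerate the tables the 'tablename*' wildcard was meant to match
# (credentials path, project, and dataset names are placeholders).
client = bigquery.Client.from_service_account_json("file path")
table_ids = [
    t.table_id
    for t in client.list_tables("google-project-ID.dataset")
    if t.table_id.startswith("tablename")
]

# Read each table through the connector and union the results;
# unionByName assumes the tables share a compatible schema.
frames = [
    spark.read.format("bigquery")
    .option("credentialsFile", "file path")
    .option("parentProject", "google-project-ID")
    .option("project", "Dataset-Name")
    .option("table", f"dataset.{table_id}")
    .load()
    for table_id in table_ids
]
df = reduce(lambda a, b: a.unionByName(b), frames)

This works in principle, but it opens one read session per table and feels clumsy, so a way to do this in a single read through the connector would be much preferred.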