diff --git a/README.md b/README.md
index 33d6257..7f165b0 100644
--- a/README.md
+++ b/README.md
@@ -338,6 +338,11 @@ Even more queries can be found [here](https://colab.research.google.com/github/R
 
 # Latest updates
 
+## Version 2.0.0 alpha 1
+- When returning a single-column DataFrame with atomic values, the name is now __value and not value to avoid collisions with user-defined columns.
+- Improved schema inference: DataFrames can be returned in a wider range of cases.
+- Improved error display in notebooks when errors happen upon collecting the results and not already upon calling jsoniq().
+
 ## Version 0.2.0 alpha 9
 - Stability improvements.
 
diff --git a/pyproject.toml b/pyproject.toml
index 8adad2d..faa6e4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "jsoniq"
-version = "0.2.0a9"
+version = "2.0.0a1"
 description = "Python edition of RumbleDB, a JSONiq engine"
 requires-python = ">=3.11"
 dependencies = [
diff --git a/src/jsoniq/jars/rumbledb-1.24.0.jar b/src/jsoniq/jars/rumbledb-2.0.0.jar
similarity index 92%
rename from src/jsoniq/jars/rumbledb-1.24.0.jar
rename to src/jsoniq/jars/rumbledb-2.0.0.jar
index 089086f..101093e 100644
Binary files a/src/jsoniq/jars/rumbledb-1.24.0.jar and b/src/jsoniq/jars/rumbledb-2.0.0.jar differ
diff --git a/src/jsoniq/sequence.py b/src/jsoniq/sequence.py
index b4844ad..2bb729d 100644
--- a/src/jsoniq/sequence.py
+++ b/src/jsoniq/sequence.py
@@ -54,16 +54,10 @@ def rdd(self):
         return self._rumblesession.lastResult
 
     def df(self):
-        if (not "DataFrame" in self._jsequence.availableOutputs()):
-            sys.stderr.write(self.schema_str)
-            return None
         self._rumblesession.lastResult = DataFrame(self._jsequence.getAsDataFrame(), self._sparksession)
         return self._rumblesession.lastResult
 
     def pdf(self):
-        if (not "DataFrame" in self._jsequence.availableOutputs()):
-            sys.stderr.write(self.schema_str)
-            return None
         self._rumblesession.lastResult = self.df().toPandas()
         return self._rumblesession.lastResult
 
diff --git a/src/jsoniq/session.py b/src/jsoniq/session.py
index 5d5c9bf..d26e307 100644
--- a/src/jsoniq/session.py
+++ b/src/jsoniq/session.py
@@ -7,7 +7,7 @@
 import pandas as pd
 import importlib.resources as pkg_resources
 
-with pkg_resources.path("jsoniq.jars", "rumbledb-1.24.0.jar") as jar_path:
+with pkg_resources.path("jsoniq.jars", "rumbledb-2.0.0.jar") as jar_path:
     jar_path_str = "file://" + str(jar_path)
 
 def get_spark_version():
@@ -88,10 +88,13 @@ def getOrCreate(self):
                sys.stderr.write("[Error] Could not determine Spark version. The SPARK_HOME environment variable may not be set properly. Please check that it points to a valid path to a Spark 4.0 directory, or maybe the easiest would be to delete the environment variable SPARK_HOME completely to fall back to the installation of Spark 4.0 packaged with pyspark.\n")
                sys.stderr.write(f"Current value of SPARK_HOME: {os.environ.get('SPARK_HOME')}\n")
                sys.exit(43)
-            elif not spark_version.startswith("4.0"):
+            elif not os.environ.get('SPARK_HOME') is None and not spark_version.startswith("4.0"):
                sys.stderr.write(f"[Error] RumbleDB requires Spark 4.0, but found version {spark_version}. Please either set SPARK_HOME to a Spark 4.0 directory, or maybe the easiest would be to delete the environment variable SPARK_HOME completely to fall back to the installation of Spark 4.0 packaged with pyspark.\n")
                sys.exit(43)
             else:
+               sys.stderr.write(f"[Error] SPARK_HOME is not set, but somehow pyspark is not falling back to the packaged Spark 4.0.0 version.\n")
+               sys.stderr.write(f"We would appreciate a bug report with some information about your OS, setup, etc.\n")
+               sys.stderr.write(f"In the meantime, what you could do as a workaround is download the Spark 4.0.0 zip file from spark.apache.org, unzip it to some local directory, and point SPARK_HOME to this directory.\n")
                raise e
         return RumbleSession._rumbleSession
 
diff --git a/src/jsoniqmagic/magic.py b/src/jsoniqmagic/magic.py
index d47adc5..6f6d7f8 100644
--- a/src/jsoniqmagic/magic.py
+++ b/src/jsoniqmagic/magic.py
@@ -67,24 +67,80 @@ def run(self, line, cell=None, timed=False):
         """
 
         if(args.pyspark_data_frame):
-            df = response.df();
+            try:
+                df = response.df();
+            except Py4JJavaError as e:
+                print(e.java_exception.getMessage())
+                return
+            except Exception as e:
+                print("Query unsuccessful.")
+                print("Usual reasons: firewall, misconfigured proxy.")
+                print("Error message:")
+                print(e.args[0])
+                return
+            except:
+                print("Query unsuccessful.")
+                print("Usual reasons: firewall, misconfigured proxy.")
+                return
             if df is not None:
                 df.show()
 
         if (args.pandas_data_frame):
-            pdf = response.pdf()
+            try:
+                pdf = response.pdf()
+            except Py4JJavaError as e:
+                print(e.java_exception.getMessage())
+                return
+            except Exception as e:
+                print("Query unsuccessful.")
+                print("Usual reasons: firewall, misconfigured proxy.")
+                print("Error message:")
+                print(e.args[0])
+                return
+            except:
+                print("Query unsuccessful.")
+                print("Usual reasons: firewall, misconfigured proxy.")
+                return
             if pdf is not None:
                 print(pdf)
 
         if (args.apply_updates):
             if ("PUL" in response.availableOutputs()):
-                response.applyPUL()
+                try:
+                    response.applyPUL()
+                except Py4JJavaError as e:
+                    print(e.java_exception.getMessage())
+                    return
+                except Exception as e:
+                    print("Query unsuccessful.")
+                    print("Usual reasons: firewall, misconfigured proxy.")
+                    print("Error message:")
+                    print(e.args[0])
+                    return
+                except:
+                    print("Query unsuccessful.")
+                    print("Usual reasons: firewall, misconfigured proxy.")
+                    return
                 print("Updates applied successfully.")
             else:
                 print("No Pending Update List (PUL) available to apply.")
 
         if (args.json or (not args.pandas_data_frame and not args.pyspark_data_frame)):
-            capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
+            try:
+                capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
+            except Py4JJavaError as e:
+                print(e.java_exception.getMessage())
+                return
+            except Exception as e:
+                print("Query unsuccessful.")
+                print("Usual reasons: firewall, misconfigured proxy.")
+                print("Error message:")
+                print(e.args[0])
+                return
+            except:
+                print("Query unsuccessful.")
+                print("Usual reasons: firewall, misconfigured proxy.")
+                return
             if len(capplusone) > rumble.getRumbleConf().getResultSizeCap():
                 count = response.count()
                 print("The query output %s items, which is too many to display. Displaying the first %s items:" % (count, rumble.getRumbleConf().getResultSizeCap()))