5 changes: 5 additions & 0 deletions README.md
@@ -338,6 +338,11 @@ Even more queries can be found [here](https://colab.research.google.com/github/R

# Latest updates

## Version 2.0.0 alpha 1
- When returning a single-column DataFrame with atomic values, the column is now named `__value` rather than `value`, to avoid collisions with user-defined columns.
- Improved schema inference: DataFrames can be returned in a wider range of cases.
- Improved error display in notebooks for errors that occur while collecting the results, rather than already when calling `jsoniq()`.

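A minimal sketch of the new naming, assuming the builder-style `RumbleSession` API from `session.py` below (the query and output are illustrative):

```python
from jsoniq import RumbleSession

rumble = RumbleSession.builder.getOrCreate()

# A sequence of atomic values now materializes as a single-column
# DataFrame whose column is named __value rather than value.
df = rumble.jsoniq("for $i in 1 to 3 return $i").df()
df.show()  # one column, named __value
```
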
## Version 0.2.0 alpha 9
- Stability improvements.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "jsoniq"
version = "0.2.0a9"
version = "2.0.0a1"
description = "Python edition of RumbleDB, a JSONiq engine"
requires-python = ">=3.11"
dependencies = [
Binary file not shown.
6 changes: 0 additions & 6 deletions src/jsoniq/sequence.py
@@ -54,16 +54,10 @@ def rdd(self):
        return self._rumblesession.lastResult

    def df(self):
        if (not "DataFrame" in self._jsequence.availableOutputs()):
            sys.stderr.write(self.schema_str)
            return None
        self._rumblesession.lastResult = DataFrame(self._jsequence.getAsDataFrame(), self._sparksession)
        return self._rumblesession.lastResult

    def pdf(self):
        if (not "DataFrame" in self._jsequence.availableOutputs()):
            sys.stderr.write(self.schema_str)
            return None
        self._rumblesession.lastResult = self.df().toPandas()
        return self._rumblesession.lastResult

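With the `None`-returning guards removed, a failed DataFrame conversion now surfaces as an exception rather than a `None` result. A minimal sketch of the new calling pattern, assuming `py4j` (installed alongside pyspark) and an illustrative query whose schema cannot be inferred:

```python
from py4j.protocol import Py4JJavaError
from jsoniq import RumbleSession

rumble = RumbleSession.builder.getOrCreate()
try:
    # Heterogeneous items: no single schema, so no DataFrame output.
    df = rumble.jsoniq('({"a": 1}, "not an object")').df()
    df.show()
except Py4JJavaError as e:
    # The Java-side message explains why no DataFrame is available.
    print(e.java_exception.getMessage())
```
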
7 changes: 5 additions & 2 deletions src/jsoniq/session.py
@@ -7,7 +7,7 @@
import pandas as pd
import importlib.resources as pkg_resources

with pkg_resources.path("jsoniq.jars", "rumbledb-1.24.0.jar") as jar_path:
with pkg_resources.path("jsoniq.jars", "rumbledb-2.0.0.jar") as jar_path:
    jar_path_str = "file://" + str(jar_path)

def get_spark_version():
@@ -88,10 +88,13 @@ def getOrCreate(self):
sys.stderr.write("[Error] Could not determine Spark version. The SPARK_HOME environment variable may not be set properly. Please check that it points to a valid path to a Spark 4.0 directory, or maybe the easiest would be to delete the environment variable SPARK_HOME completely to fall back to the installation of Spark 4.0 packaged with pyspark.\n")
sys.stderr.write(f"Current value of SPARK_HOME: {os.environ.get('SPARK_HOME')}\n")
sys.exit(43)
elif not spark_version.startswith("4.0"):
elif not os.environ.get('SPARK_HOME') is None and not spark_version.startswith("4.0"):
sys.stderr.write(f"[Error] RumbleDB requires Spark 4.0, but found version {spark_version}. Please either set SPARK_HOME to a Spark 4.0 directory, or maybe the easiest would be to delete the environment variable SPARK_HOME completely to fall back to the installation of Spark 4.0 packaged with pyspark.\n")
sys.exit(43)
else:
sys.stderr.write(f"[Error] SPARK_HOME is not set, but somehow pyspark is not falling back to the packaged Spark 4.0.0 version.\n")
sys.stderr.write(f"We would appreciate a bug report with some information about your OS, setup, etc.\n")
sys.stderr.write(f"In the meantime, what you could do as a workaround is download the Spark 4.0.0 zip file from spark.apache.org, unzip it to some local directory, and point SPARK_HOME to this directory.\n")
raise e
return RumbleSession._rumbleSession

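A minimal sketch of the fallback path these messages describe; deleting a stale `SPARK_HOME` before the session starts lets pyspark use the Spark 4.0 distribution it ships with (names as in the error messages above):

```python
import os

# Hypothetical workaround: drop a stale SPARK_HOME so that pyspark
# falls back to its packaged Spark 4.0 installation.
os.environ.pop("SPARK_HOME", None)

from jsoniq import RumbleSession

rumble = RumbleSession.builder.getOrCreate()
```
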
64 changes: 60 additions & 4 deletions src/jsoniqmagic/magic.py
@@ -67,24 +67,80 @@ def run(self, line, cell=None, timed=False):
"""

if(args.pyspark_data_frame):
df = response.df();
try:
df = response.df();
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
if df is not None:
df.show()

if (args.pandas_data_frame):
pdf = response.pdf()
try:
pdf = response.pdf()
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
if pdf is not None:
print(pdf)

if (args.apply_updates):
if ("PUL" in response.availableOutputs()):
response.applyPUL()
try:
response.applyPUL()
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
print("Updates applied successfully.")
else:
print("No Pending Update List (PUL) available to apply.")

if (args.json or (not args.pandas_data_frame and not args.pyspark_data_frame)):
capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
try:
capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
if len(capplusone) > rumble.getRumbleConf().getResultSizeCap():
count = response.count()
print("The query output %s items, which is too many to display. Displaying the first %s items:" % (count, rumble.getRumbleConf().getResultSizeCap()))
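For the notebook path, a minimal usage sketch, assuming the cell magic is registered as `%%jsoniq` (with no flags, results are displayed as JSON up to the configured result size cap):

```
%%jsoniq
for $i in 1 to 3
return { "n": $i }
```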