5 changes: 5 additions & 0 deletions README.md
@@ -338,6 +338,11 @@ Even more queries can be found [here](https://colab.research.google.com/github/R

# Latest updates

## Version 2.0.0 alpha 1
- When returning a single-column DataFrame with atomic values, the column is now named `__value` rather than `value`, to avoid collisions with user-defined columns.
- Improved schema inference: DataFrames can be returned in a wider range of cases.
- Improved error display in notebooks for errors that occur while collecting the results, rather than already when calling `jsoniq()`.

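A minimal sketch of the new naming, assuming the builder-style `RumbleSession` API from `session.py` below (the query and output are illustrative):

```python
from jsoniq import RumbleSession

rumble = RumbleSession.builder.getOrCreate()

# A sequence of atomic values now materializes as a single-column
# DataFrame whose column is named __value rather than value.
df = rumble.jsoniq("for $i in 1 to 3 return $i").df()
df.show()  # one column, named __value
```
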
## Version 0.2.0 alpha 9
- Stability improvements.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "jsoniq"
version = "0.2.0a9"
version = "2.0.0a1"
description = "Python edition of RumbleDB, a JSONiq engine"
requires-python = ">=3.11"
dependencies = [
Binary file not shown.
6 changes: 0 additions & 6 deletions src/jsoniq/sequence.py
@@ -54,16 +54,10 @@ def rdd(self):
        return self._rumblesession.lastResult

    def df(self):
        if (not "DataFrame" in self._jsequence.availableOutputs()):
            sys.stderr.write(self.schema_str)
            return None
        self._rumblesession.lastResult = DataFrame(self._jsequence.getAsDataFrame(), self._sparksession)
        return self._rumblesession.lastResult

    def pdf(self):
        if (not "DataFrame" in self._jsequence.availableOutputs()):
            sys.stderr.write(self.schema_str)
            return None
        self._rumblesession.lastResult = self.df().toPandas()
        return self._rumblesession.lastResult

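With the `None`-returning guards removed, a failed DataFrame conversion now surfaces as an exception rather than a `None` result. A minimal sketch of the new calling pattern, assuming `py4j` (installed alongside pyspark) and an illustrative query whose schema cannot be inferred:

```python
from py4j.protocol import Py4JJavaError
from jsoniq import RumbleSession

rumble = RumbleSession.builder.getOrCreate()
try:
    # Heterogeneous items: no single schema, so no DataFrame output.
    df = rumble.jsoniq('({"a": 1}, "not an object")').df()
    df.show()
except Py4JJavaError as e:
    # The Java-side message explains why no DataFrame is available.
    print(e.java_exception.getMessage())
```
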
7 changes: 5 additions & 2 deletions src/jsoniq/session.py
@@ -7,7 +7,7 @@
import pandas as pd
import importlib.resources as pkg_resources

with pkg_resources.path("jsoniq.jars", "rumbledb-1.24.0.jar") as jar_path:
with pkg_resources.path("jsoniq.jars", "rumbledb-2.0.0.jar") as jar_path:
    jar_path_str = "file://" + str(jar_path)

def get_spark_version():
@@ -88,10 +88,13 @@ def getOrCreate(self):
sys.stderr.write("[Error] Could not determine Spark version. The SPARK_HOME environment variable may not be set properly. Please check that it points to a valid path to a Spark 4.0 directory, or maybe the easiest would be to delete the environment variable SPARK_HOME completely to fall back to the installation of Spark 4.0 packaged with pyspark.\n")
sys.stderr.write(f"Current value of SPARK_HOME: {os.environ.get('SPARK_HOME')}\n")
sys.exit(43)
elif not spark_version.startswith("4.0"):
elif not os.environ.get('SPARK_HOME') is None and not spark_version.startswith("4.0"):
sys.stderr.write(f"[Error] RumbleDB requires Spark 4.0, but found version {spark_version}. Please either set SPARK_HOME to a Spark 4.0 directory, or maybe the easiest would be to delete the environment variable SPARK_HOME completely to fall back to the installation of Spark 4.0 packaged with pyspark.\n")
sys.exit(43)
else:
sys.stderr.write(f"[Error] SPARK_HOME is not set, but somehow pyspark is not falling back to the packaged Spark 4.0.0 version.\n")
sys.stderr.write(f"We would appreciate a bug report with some information about your OS, setup, etc.\n")
sys.stderr.write(f"In the meantime, what you could do as a workaround is download the Spark 4.0.0 zip file from spark.apache.org, unzip it to some local directory, and point SPARK_HOME to this directory.\n")
raise e
return RumbleSession._rumbleSession

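A minimal sketch of the fallback path these messages describe; deleting a stale `SPARK_HOME` before the session starts lets pyspark use the Spark 4.0 distribution it ships with (names as in the error messages above):

```python
import os

# Hypothetical workaround: drop a stale SPARK_HOME so that pyspark
# falls back to its packaged Spark 4.0 installation.
os.environ.pop("SPARK_HOME", None)

from jsoniq import RumbleSession

rumble = RumbleSession.builder.getOrCreate()
```
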
64 changes: 60 additions & 4 deletions src/jsoniqmagic/magic.py
@@ -67,24 +67,80 @@ def run(self, line, cell=None, timed=False):
"""

if(args.pyspark_data_frame):
df = response.df();
try:
df = response.df();
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
if df is not None:
df.show()

if (args.pandas_data_frame):
pdf = response.pdf()
try:
pdf = response.pdf()
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
if pdf is not None:
print(pdf)

if (args.apply_updates):
if ("PUL" in response.availableOutputs()):
response.applyPUL()
try:
response.applyPUL()
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
print("Updates applied successfully.")
else:
print("No Pending Update List (PUL) available to apply.")

if (args.json or (not args.pandas_data_frame and not args.pyspark_data_frame)):
capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
try:
capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
except Py4JJavaError as e:
print(e.java_exception.getMessage())
return
except Exception as e:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
print("Error message:")
print(e.args[0])
return
except:
print("Query unsuccessful.")
print("Usual reasons: firewall, misconfigured proxy.")
return
if len(capplusone) > rumble.getRumbleConf().getResultSizeCap():
count = response.count()
print("The query output %s items, which is too many to display. Displaying the first %s items:" % (count, rumble.getRumbleConf().getResultSizeCap()))
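For the notebook path, a minimal usage sketch, assuming the cell magic is registered as `%%jsoniq` (with no flags, results are displayed as JSON up to the configured result size cap):

```
%%jsoniq
for $i in 1 to 3
return { "n": $i }
```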