# Check which JVM is installed before setting up the Python environment.
java -version
# Expected result: Java 8 (Oracle or OpenJDK).
# Create a dedicated conda environment named "sparknlp" on Python 3.8 and
# switch into it so the pip installs below land in that environment.
conda create -n sparknlp python=3.8 -y
conda activate sparknlp
# Pin the Spark NLP Python package and PySpark to explicit versions.
# NOTE(review): the Ivy resolution log further down reports
# com.johnsnowlabs.nlp#spark-nlp_2.12;6.1.3, not 6.2.0 -- possibly one of the
# later installs changed the spark-nlp version; confirm with
# `pip show spark-nlp` inside the environment.
pip install spark-nlp==6.2.0 pyspark==3.3.1
# Additional packages, installed unpinned.
pip install johnsnowlabs
pip install jupyter

# Start a session via Spark NLP's start() helper and keep the returned object
# in `spark`. The console output that follows shows Ivy resolving the
# com.johnsnowlabs.nlp#spark-nlp_2.12 package and its dependencies.
import sparknlp
spark = sparknlp.start()
# GPU variant of the same call, left disabled:
#spark = sparknlp.start(gpu=True)

25/11/13 11:17:24 WARN Utils: Your hostname, legion resolves to a loopback address: 127.0.1.1; using 192.168.1.2 instead (on interface enp3s0)
25/11/13 11:17:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address

:: loading settings :: url = jar:file:/home/legion/miniconda3/envs/sparknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml

Ivy Default Cache set to: /home/legion/.ivy2/cache
The jars for the packages stored in: /home/legion/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5aace11f-4e53-4048-9842-108e0f27911c;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;6.1.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.guava#failureaccess;1.0.1 in central
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
	found com.google.errorprone#error_prone_annotations;2.18.0 in central
	found com.google.j2objc#j2objc-annotations;1.3 in central
	found com.google.http-client#google-http-client;1.43.0 in central
	found io.opencensus#opencensus-contrib-http-util;0.31.1 in central
	found com.google.http-client#google-http-client-jackson2;1.43.0 in central
	found com.google.http-client#google-http-client-gson;1.43.0 in central
	found com.google.api-client#google-api-client;2.2.0 in central
	found com.google.oauth-client#google-oauth-client;1.34.1 in central
	found com.google.http-client#google-http-client-apache-v2;1.43.0 in central
	found com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central
	found com.google.code.gson#gson;2.10.1 in central
	found com.google.cloud#google-cloud-core;2.12.0 in central
	found io.grpc#grpc-context;1.53.0 in central
	found com.google.auto.value#auto-value-annotations;1.10.1 in central
	found com.google.auto.value#auto-value;1.10.1 in central
	found javax.annotation#javax.annotation-api;1.3.2 in central
	found com.google.cloud#google-cloud-core-http;2.12.0 in central
	found com.google.http-client#google-http-client-appengine;1.43.0 in central
	found com.google.api#gax-httpjson;0.108.2 in central
	found com.google.cloud#google-cloud-core-grpc;2.12.0 in central
	found io.grpc#grpc-alts;1.53.0 in central
	found io.grpc#grpc-grpclb;1.53.0 in central
	found org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central
	found io.grpc#grpc-auth;1.53.0 in central
	found io.grpc#grpc-protobuf;1.53.0 in central
	found io.grpc#grpc-protobuf-lite;1.53.0 in central
	found io.grpc#grpc-core;1.53.0 in central
	found com.google.api#gax;2.23.2 in central
	found com.google.api#gax-grpc;2.23.2 in central
	found com.google.auth#google-auth-library-credentials;1.16.0 in central
	found com.google.auth#google-auth-library-oauth2-http;1.16.0 in central
	found com.google.api#api-common;2.6.2 in central
	found io.opencensus#opencensus-api;0.31.1 in central
	found com.google.api.grpc#proto-google-iam-v1;1.9.2 in central
	found com.google.protobuf#protobuf-java;3.21.12 in central
	found com.google.protobuf#protobuf-java-util;3.21.12 in central
	found com.google.api.grpc#proto-google-common-protos;2.14.2 in central
	found org.threeten#threetenbp;1.6.5 in central
	found com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central
	found com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central
	found com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found io.grpc#grpc-api;1.53.0 in central
	found io.grpc#grpc-stub;1.53.0 in central
	found org.checkerframework#checker-qual;3.31.0 in central
	found io.perfmark#perfmark-api;0.26.0 in central
	found com.google.android#annotations;4.1.1.4 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.22 in central
	found io.opencensus#opencensus-proto;0.2.0 in central
	found io.grpc#grpc-services;1.53.0 in central
	found com.google.re2j#re2j;1.6 in central
	found io.grpc#grpc-netty-shaded;1.53.0 in central
	found io.grpc#grpc-googleapis;1.53.0 in central
	found io.grpc#grpc-xds;1.53.0 in central
	found com.navigamez#greex;1.0 in central
	found dk.brics.automaton#automaton;1.11-8 in central
	found org.jsoup#jsoup;1.18.2 in central
	found jakarta.mail#jakarta.mail-api;2.1.3 in central
	found jakarta.activation#jakarta.activation-api;2.1.3 in central
	found org.eclipse.angus#angus-mail;2.0.3 in central
	found org.eclipse.angus#angus-activation;2.0.2 in central
	found org.apache.poi#poi-ooxml;4.1.2 in central
	found org.apache.poi#poi;4.1.2 in central
	found org.apache.commons#commons-collections4;4.4 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found com.zaxxer#SparseBitSet;1.2 in central
	found org.apache.poi#poi-ooxml-schemas;4.1.2 in central
	found org.apache.xmlbeans#xmlbeans;3.1.0 in central
	found org.apache.commons#commons-compress;1.19 in central
	found com.github.virtuald#curvesapi;1.06 in central
	found org.apache.poi#poi-scratchpad;4.1.2 in central
	found org.apache.pdfbox#pdfbox;2.0.28 in central
	found org.apache.pdfbox#fontbox;2.0.28 in central
	found com.vladsch.flexmark#flexmark-all;0.61.34 in central
	found com.vladsch.flexmark#flexmark;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-ast;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-collection;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-misc;0.61.34 in central
	found org.jetbrains#annotations;15.0 in central
	found com.vladsch.flexmark#flexmark-util-data;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-sequence;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-visitor;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-builder;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-dependency;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-format;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-html;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-abbreviation;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-options;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-autolink;0.61.34 in central
	found org.nibor.autolink#autolink;0.6.0 in central
	found com.vladsch.flexmark#flexmark-ext-admonition;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-anchorlink;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-aside;0.61.34 in central
	found com.vladsch.flexmark#flexmark-jira-converter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-strikethrough;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-tables;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-wikilink;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-ins;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-superscript;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-attributes;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-definition;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-emoji;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-enumerated-reference;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-escaped-character;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-footnotes;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-issues;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-tasklist;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-users;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gitlab;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-jekyll-front-matter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-yaml-front-matter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-jekyll-tag;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-media-tags;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-macros;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-xwiki-macros;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-toc;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-typographic;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-youtube-embedded;0.61.34 in central
	found com.vladsch.flexmark#flexmark-html2md-converter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-pdf-converter;0.61.34 in central
	found com.openhtmltopdf#openhtmltopdf-core;1.0.0 in central
	found com.openhtmltopdf#openhtmltopdf-pdfbox;1.0.0 in central
	found org.apache.pdfbox#xmpbox;2.0.16 in central
	found de.rototor.pdfbox#graphics2d;0.24 in central
	found com.openhtmltopdf#openhtmltopdf-rtl-support;1.0.0 in central
	found com.ibm.icu#icu4j;59.1 in central
	found com.openhtmltopdf#openhtmltopdf-jsoup-dom-converter;1.0.0 in central
	found com.vladsch.flexmark#flexmark-profile-pegdown;0.61.34 in central
	found com.vladsch.flexmark#flexmark-youtrack-converter;0.61.34 in central
	found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central
	found com.microsoft.onnxruntime#onnxruntime;1.19.2 in central
	found com.johnsnowlabs.nlp#jsl-llamacpp-cpu;1.0.2 in central
	found org.jetbrains#annotations;24.1.0 in central
	found com.johnsnowlabs.nlp#jsl-openvino-cpu_2.12;0.2.0 in central
:: resolution report :: resolve 824ms :: artifacts dl 22ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-core;1.12.500 from central in [default]
	com.amazonaws#aws-java-sdk-kms;1.12.500 from central in [default]
	com.amazonaws#aws-java-sdk-s3;1.12.500 from central in [default]
	com.amazonaws#jmespath-java;1.12.500 from central in [default]
	com.github.universal-automata#liblevenshtein;3.0.0 from central in [default]
	com.github.virtuald#curvesapi;1.06 from central in [default]
	com.google.android#annotations;4.1.1.4 from central in [default]
	com.google.api#api-common;2.6.2 from central in [default]
	com.google.api#gax;2.23.2 from central in [default]
	com.google.api#gax-grpc;2.23.2 from central in [default]
	com.google.api#gax-httpjson;0.108.2 from central in [default]
	com.google.api-client#google-api-client;2.2.0 from central in [default]
	com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default]
	com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default]
	com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default]
	com.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default]
	com.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default]
	com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]
	com.google.auth#google-auth-library-credentials;1.16.0 from central in [default]
	com.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default]
	com.google.auto.value#auto-value;1.10.1 from central in [default]
	com.google.auto.value#auto-value-annotations;1.10.1 from central in [default]
	com.google.cloud#google-cloud-core;2.12.0 from central in [default]
	com.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default]
	com.google.cloud#google-cloud-core-http;2.12.0 from central in [default]
	com.google.cloud#google-cloud-storage;2.20.1 from central in [default]
	com.google.code.findbugs#jsr305;3.0.2 from central in [default]
	com.google.code.gson#gson;2.10.1 from central in [default]
	com.google.errorprone#error_prone_annotations;2.18.0 from central in [default]
	com.google.guava#failureaccess;1.0.1 from central in [default]
	com.google.guava#guava;31.1-jre from central in [default]
	com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]
	com.google.http-client#google-http-client;1.43.0 from central in [default]
	com.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default]
	com.google.http-client#google-http-client-appengine;1.43.0 from central in [default]
	com.google.http-client#google-http-client-gson;1.43.0 from central in [default]
	com.google.http-client#google-http-client-jackson2;1.43.0 from central in [default]
	com.google.j2objc#j2objc-annotations;1.3 from central in [default]
	com.google.oauth-client#google-oauth-client;1.34.1 from central in [default]
	com.google.protobuf#protobuf-java;3.21.12 from central in [default]
	com.google.protobuf#protobuf-java-util;3.21.12 from central in [default]
	com.google.re2j#re2j;1.6 from central in [default]
	com.ibm.icu#icu4j;59.1 from central in [default]
	com.johnsnowlabs.nlp#jsl-llamacpp-cpu;1.0.2 from central in [default]
	com.johnsnowlabs.nlp#jsl-openvino-cpu_2.12;0.2.0 from central in [default]
	com.johnsnowlabs.nlp#spark-nlp_2.12;6.1.3 from central in [default]
	com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]
	com.microsoft.onnxruntime#onnxruntime;1.19.2 from central in [default]
	com.navigamez#greex;1.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-core;1.0.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-jsoup-dom-converter;1.0.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-pdfbox;1.0.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-rtl-support;1.0.0 from central in [default]
	com.typesafe#config;1.4.2 from central in [default]
	com.vladsch.flexmark#flexmark;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-all;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-abbreviation;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-admonition;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-anchorlink;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-aside;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-attributes;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-autolink;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-definition;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-emoji;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-enumerated-reference;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-escaped-character;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-footnotes;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-issues;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-strikethrough;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-tasklist;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-users;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gitlab;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-ins;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-jekyll-front-matter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-jekyll-tag;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-macros;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-media-tags;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-superscript;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-tables;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-toc;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-typographic;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-wikilink;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-xwiki-macros;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-yaml-front-matter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-youtube-embedded;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-html2md-converter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-jira-converter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-pdf-converter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-profile-pegdown;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-ast;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-builder;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-collection;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-data;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-dependency;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-format;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-html;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-misc;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-options;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-sequence;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-visitor;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-youtrack-converter;0.61.34 from central in [default]
	com.zaxxer#SparseBitSet;1.2 from central in [default]
	commons-codec#commons-codec;1.15 from central in [default]
	commons-logging#commons-logging;1.1.3 from central in [default]
	de.rototor.pdfbox#graphics2d;0.24 from central in [default]
	dk.brics.automaton#automaton;1.11-8 from central in [default]
	io.grpc#grpc-alts;1.53.0 from central in [default]
	io.grpc#grpc-api;1.53.0 from central in [default]
	io.grpc#grpc-auth;1.53.0 from central in [default]
	io.grpc#grpc-context;1.53.0 from central in [default]
	io.grpc#grpc-core;1.53.0 from central in [default]
	io.grpc#grpc-googleapis;1.53.0 from central in [default]
	io.grpc#grpc-grpclb;1.53.0 from central in [default]
	io.grpc#grpc-netty-shaded;1.53.0 from central in [default]
	io.grpc#grpc-protobuf;1.53.0 from central in [default]
	io.grpc#grpc-protobuf-lite;1.53.0 from central in [default]
	io.grpc#grpc-services;1.53.0 from central in [default]
	io.grpc#grpc-stub;1.53.0 from central in [default]
	io.grpc#grpc-xds;1.53.0 from central in [default]
	io.opencensus#opencensus-api;0.31.1 from central in [default]
	io.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]
	io.opencensus#opencensus-proto;0.2.0 from central in [default]
	io.perfmark#perfmark-api;0.26.0 from central in [default]
	it.unimi.dsi#fastutil;7.0.12 from central in [default]
	jakarta.activation#jakarta.activation-api;2.1.3 from central in [default]
	jakarta.mail#jakarta.mail-api;2.1.3 from central in [default]
	javax.annotation#javax.annotation-api;1.3.2 from central in [default]
	joda-time#joda-time;2.8.1 from central in [default]
	org.apache.commons#commons-collections4;4.4 from central in [default]
	org.apache.commons#commons-compress;1.19 from central in [default]
	org.apache.commons#commons-math3;3.6.1 from central in [default]
	org.apache.httpcomponents#httpclient;4.5.13 from central in [default]
	org.apache.httpcomponents#httpcore;4.4.13 from central in [default]
	org.apache.pdfbox#fontbox;2.0.28 from central in [default]
	org.apache.pdfbox#pdfbox;2.0.28 from central in [default]
	org.apache.pdfbox#xmpbox;2.0.16 from central in [default]
	org.apache.poi#poi;4.1.2 from central in [default]
	org.apache.poi#poi-ooxml;4.1.2 from central in [default]
	org.apache.poi#poi-ooxml-schemas;4.1.2 from central in [default]
	org.apache.poi#poi-scratchpad;4.1.2 from central in [default]
	org.apache.xmlbeans#xmlbeans;3.1.0 from central in [default]
	org.checkerframework#checker-qual;3.31.0 from central in [default]
	org.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]
	org.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]
	org.eclipse.angus#angus-activation;2.0.2 from central in [default]
	org.eclipse.angus#angus-mail;2.0.3 from central in [default]
	org.jetbrains#annotations;24.1.0 from central in [default]
	org.jsoup#jsoup;1.18.2 from central in [default]
	org.nibor.autolink#autolink;0.6.0 from central in [default]
	org.projectlombok#lombok;1.16.8 from central in [default]
	org.rocksdb#rocksdbjni;6.29.5 from central in [default]
	org.threeten#threetenbp;1.6.5 from central in [default]
	software.amazon.ion#ion-java;1.0.2 from central in [default]
	:: evicted modules:
	commons-logging#commons-logging;1.2 by [commons-logging#commons-logging;1.1.3] in [default]
	commons-codec#commons-codec;1.11 by [commons-codec#commons-codec;1.15] in [default]
	com.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default]
	com.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default]
	com.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10.1] in [default]
	commons-codec#commons-codec;1.13 by [commons-codec#commons-codec;1.15] in [default]
	org.jetbrains#annotations;15.0 by [org.jetbrains#annotations;24.1.0] in [default]
	org.jsoup#jsoup;1.11.3 by [org.jsoup#jsoup;1.18.2] in [default]
	org.apache.pdfbox#pdfbox;2.0.16 by [org.apache.pdfbox#pdfbox;2.0.28] in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  163  |   0   |   0   |   9   ||  154  |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-5aace11f-4e53-4048-9842-108e0f27911c
	confs: [default]
	0 artifacts copied, 154 already retrieved (0kB/11ms)
25/11/13 11:17:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/13 11:17:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.

# Smoke test: start a session, load a pretrained pipeline by name, and
# annotate a single sentence.
import sparknlp

# Start the session first; the pipeline below is created afterwards.
spark = sparknlp.start()

from sparknlp.pretrained import PretrainedPipeline

# Build the pipeline identified by the name "explain_document_ml".
# NOTE(review): presumably fetched on first use and cached afterwards -- confirm.
explain_document_pipeline = PretrainedPipeline("explain_document_ml")
# Run the pipeline on one input string and print whatever annotate() returns.
annotations = explain_document_pipeline.annotate("We are very happy about SparkNLP")
print(annotations)

25/11/19 07:33:23 WARN Utils: Your hostname, legion resolves to a loopback address: 127.0.1.1; using 192.168.1.2 instead (on interface enp3s0)
25/11/19 07:33:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address

:: loading settings :: url = jar:file:/home/legion/miniconda3/envs/sparknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml

Ivy Default Cache set to: /home/legion/.ivy2/cache
The jars for the packages stored in: /home/legion/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-73000933-a013-4409-92c5-17b3ebf12849;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;6.1.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.guava#failureaccess;1.0.1 in central
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
	found com.google.errorprone#error_prone_annotations;2.18.0 in central
	found com.google.j2objc#j2objc-annotations;1.3 in central
	found com.google.http-client#google-http-client;1.43.0 in central
	found io.opencensus#opencensus-contrib-http-util;0.31.1 in central
	found com.google.http-client#google-http-client-jackson2;1.43.0 in central
	found com.google.http-client#google-http-client-gson;1.43.0 in central
	found com.google.api-client#google-api-client;2.2.0 in central
	found com.google.oauth-client#google-oauth-client;1.34.1 in central
	found com.google.http-client#google-http-client-apache-v2;1.43.0 in central
	found com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central
	found com.google.code.gson#gson;2.10.1 in central
	found com.google.cloud#google-cloud-core;2.12.0 in central
	found io.grpc#grpc-context;1.53.0 in central
	found com.google.auto.value#auto-value-annotations;1.10.1 in central
	found com.google.auto.value#auto-value;1.10.1 in central
	found javax.annotation#javax.annotation-api;1.3.2 in central
	found com.google.cloud#google-cloud-core-http;2.12.0 in central
	found com.google.http-client#google-http-client-appengine;1.43.0 in central
	found com.google.api#gax-httpjson;0.108.2 in central
	found com.google.cloud#google-cloud-core-grpc;2.12.0 in central
	found io.grpc#grpc-alts;1.53.0 in central
	found io.grpc#grpc-grpclb;1.53.0 in central
	found org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central
	found io.grpc#grpc-auth;1.53.0 in central
	found io.grpc#grpc-protobuf;1.53.0 in central
	found io.grpc#grpc-protobuf-lite;1.53.0 in central
	found io.grpc#grpc-core;1.53.0 in central
	found com.google.api#gax;2.23.2 in central
	found com.google.api#gax-grpc;2.23.2 in central
	found com.google.auth#google-auth-library-credentials;1.16.0 in central
	found com.google.auth#google-auth-library-oauth2-http;1.16.0 in central
	found com.google.api#api-common;2.6.2 in central
	found io.opencensus#opencensus-api;0.31.1 in central
	found com.google.api.grpc#proto-google-iam-v1;1.9.2 in central
	found com.google.protobuf#protobuf-java;3.21.12 in central
	found com.google.protobuf#protobuf-java-util;3.21.12 in central
	found com.google.api.grpc#proto-google-common-protos;2.14.2 in central
	found org.threeten#threetenbp;1.6.5 in central
	found com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central
	found com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central
	found com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found io.grpc#grpc-api;1.53.0 in central
	found io.grpc#grpc-stub;1.53.0 in central
	found org.checkerframework#checker-qual;3.31.0 in central
	found io.perfmark#perfmark-api;0.26.0 in central
	found com.google.android#annotations;4.1.1.4 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.22 in central
	found io.opencensus#opencensus-proto;0.2.0 in central
	found io.grpc#grpc-services;1.53.0 in central
	found com.google.re2j#re2j;1.6 in central
	found io.grpc#grpc-netty-shaded;1.53.0 in central
	found io.grpc#grpc-googleapis;1.53.0 in central
	found io.grpc#grpc-xds;1.53.0 in central
	found com.navigamez#greex;1.0 in central
	found dk.brics.automaton#automaton;1.11-8 in central
	found org.jsoup#jsoup;1.18.2 in central
	found jakarta.mail#jakarta.mail-api;2.1.3 in central
	found jakarta.activation#jakarta.activation-api;2.1.3 in central
	found org.eclipse.angus#angus-mail;2.0.3 in central
	found org.eclipse.angus#angus-activation;2.0.2 in central
	found org.apache.poi#poi-ooxml;4.1.2 in central
	found org.apache.poi#poi;4.1.2 in central
	found org.apache.commons#commons-collections4;4.4 in central
	found org.apache.commons#commons-math3;3.6.1 in central
	found com.zaxxer#SparseBitSet;1.2 in central
	found org.apache.poi#poi-ooxml-schemas;4.1.2 in central
	found org.apache.xmlbeans#xmlbeans;3.1.0 in central
	found org.apache.commons#commons-compress;1.19 in central
	found com.github.virtuald#curvesapi;1.06 in central
	found org.apache.poi#poi-scratchpad;4.1.2 in central
	found org.apache.pdfbox#pdfbox;2.0.28 in central
	found org.apache.pdfbox#fontbox;2.0.28 in central
	found com.vladsch.flexmark#flexmark-all;0.61.34 in central
	found com.vladsch.flexmark#flexmark;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-ast;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-collection;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-misc;0.61.34 in central
	found org.jetbrains#annotations;15.0 in central
	found com.vladsch.flexmark#flexmark-util-data;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-sequence;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-visitor;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-builder;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-dependency;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-format;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-html;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-abbreviation;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util;0.61.34 in central
	found com.vladsch.flexmark#flexmark-util-options;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-autolink;0.61.34 in central
	found org.nibor.autolink#autolink;0.6.0 in central
	found com.vladsch.flexmark#flexmark-ext-admonition;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-anchorlink;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-aside;0.61.34 in central
	found com.vladsch.flexmark#flexmark-jira-converter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-strikethrough;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-tables;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-wikilink;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-ins;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-superscript;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-attributes;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-definition;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-emoji;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-enumerated-reference;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-escaped-character;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-footnotes;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-issues;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-tasklist;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gfm-users;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-gitlab;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-jekyll-front-matter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-yaml-front-matter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-jekyll-tag;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-media-tags;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-macros;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-xwiki-macros;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-toc;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-typographic;0.61.34 in central
	found com.vladsch.flexmark#flexmark-ext-youtube-embedded;0.61.34 in central
	found com.vladsch.flexmark#flexmark-html2md-converter;0.61.34 in central
	found com.vladsch.flexmark#flexmark-pdf-converter;0.61.34 in central
	found com.openhtmltopdf#openhtmltopdf-core;1.0.0 in central
	found com.openhtmltopdf#openhtmltopdf-pdfbox;1.0.0 in central
	found org.apache.pdfbox#xmpbox;2.0.16 in central
	found de.rototor.pdfbox#graphics2d;0.24 in central
	found com.openhtmltopdf#openhtmltopdf-rtl-support;1.0.0 in central
	found com.ibm.icu#icu4j;59.1 in central
	found com.openhtmltopdf#openhtmltopdf-jsoup-dom-converter;1.0.0 in central
	found com.vladsch.flexmark#flexmark-profile-pegdown;0.61.34 in central
	found com.vladsch.flexmark#flexmark-youtrack-converter;0.61.34 in central
	found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central
	found com.microsoft.onnxruntime#onnxruntime;1.19.2 in central
	found com.johnsnowlabs.nlp#jsl-llamacpp-cpu;1.0.2 in central
	found org.jetbrains#annotations;24.1.0 in central
	found com.johnsnowlabs.nlp#jsl-openvino-cpu_2.12;0.2.0 in central
:: resolution report :: resolve 831ms :: artifacts dl 43ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-core;1.12.500 from central in [default]
	com.amazonaws#aws-java-sdk-kms;1.12.500 from central in [default]
	com.amazonaws#aws-java-sdk-s3;1.12.500 from central in [default]
	com.amazonaws#jmespath-java;1.12.500 from central in [default]
	com.github.universal-automata#liblevenshtein;3.0.0 from central in [default]
	com.github.virtuald#curvesapi;1.06 from central in [default]
	com.google.android#annotations;4.1.1.4 from central in [default]
	com.google.api#api-common;2.6.2 from central in [default]
	com.google.api#gax;2.23.2 from central in [default]
	com.google.api#gax-grpc;2.23.2 from central in [default]
	com.google.api#gax-httpjson;0.108.2 from central in [default]
	com.google.api-client#google-api-client;2.2.0 from central in [default]
	com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default]
	com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default]
	com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default]
	com.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default]
	com.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default]
	com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]
	com.google.auth#google-auth-library-credentials;1.16.0 from central in [default]
	com.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default]
	com.google.auto.value#auto-value;1.10.1 from central in [default]
	com.google.auto.value#auto-value-annotations;1.10.1 from central in [default]
	com.google.cloud#google-cloud-core;2.12.0 from central in [default]
	com.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default]
	com.google.cloud#google-cloud-core-http;2.12.0 from central in [default]
	com.google.cloud#google-cloud-storage;2.20.1 from central in [default]
	com.google.code.findbugs#jsr305;3.0.2 from central in [default]
	com.google.code.gson#gson;2.10.1 from central in [default]
	com.google.errorprone#error_prone_annotations;2.18.0 from central in [default]
	com.google.guava#failureaccess;1.0.1 from central in [default]
	com.google.guava#guava;31.1-jre from central in [default]
	com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]
	com.google.http-client#google-http-client;1.43.0 from central in [default]
	com.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default]
	com.google.http-client#google-http-client-appengine;1.43.0 from central in [default]
	com.google.http-client#google-http-client-gson;1.43.0 from central in [default]
	com.google.http-client#google-http-client-jackson2;1.43.0 from central in [default]
	com.google.j2objc#j2objc-annotations;1.3 from central in [default]
	com.google.oauth-client#google-oauth-client;1.34.1 from central in [default]
	com.google.protobuf#protobuf-java;3.21.12 from central in [default]
	com.google.protobuf#protobuf-java-util;3.21.12 from central in [default]
	com.google.re2j#re2j;1.6 from central in [default]
	com.ibm.icu#icu4j;59.1 from central in [default]
	com.johnsnowlabs.nlp#jsl-llamacpp-cpu;1.0.2 from central in [default]
	com.johnsnowlabs.nlp#jsl-openvino-cpu_2.12;0.2.0 from central in [default]
	com.johnsnowlabs.nlp#spark-nlp_2.12;6.1.3 from central in [default]
	com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]
	com.microsoft.onnxruntime#onnxruntime;1.19.2 from central in [default]
	com.navigamez#greex;1.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-core;1.0.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-jsoup-dom-converter;1.0.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-pdfbox;1.0.0 from central in [default]
	com.openhtmltopdf#openhtmltopdf-rtl-support;1.0.0 from central in [default]
	com.typesafe#config;1.4.2 from central in [default]
	com.vladsch.flexmark#flexmark;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-all;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-abbreviation;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-admonition;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-anchorlink;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-aside;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-attributes;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-autolink;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-definition;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-emoji;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-enumerated-reference;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-escaped-character;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-footnotes;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-issues;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-strikethrough;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-tasklist;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gfm-users;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-gitlab;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-ins;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-jekyll-front-matter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-jekyll-tag;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-macros;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-media-tags;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-superscript;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-tables;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-toc;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-typographic;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-wikilink;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-xwiki-macros;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-yaml-front-matter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-ext-youtube-embedded;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-html2md-converter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-jira-converter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-pdf-converter;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-profile-pegdown;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-ast;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-builder;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-collection;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-data;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-dependency;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-format;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-html;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-misc;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-options;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-sequence;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-util-visitor;0.61.34 from central in [default]
	com.vladsch.flexmark#flexmark-youtrack-converter;0.61.34 from central in [default]
	com.zaxxer#SparseBitSet;1.2 from central in [default]
	commons-codec#commons-codec;1.15 from central in [default]
	commons-logging#commons-logging;1.1.3 from central in [default]
	de.rototor.pdfbox#graphics2d;0.24 from central in [default]
	dk.brics.automaton#automaton;1.11-8 from central in [default]
	io.grpc#grpc-alts;1.53.0 from central in [default]
	io.grpc#grpc-api;1.53.0 from central in [default]
	io.grpc#grpc-auth;1.53.0 from central in [default]
	io.grpc#grpc-context;1.53.0 from central in [default]
	io.grpc#grpc-core;1.53.0 from central in [default]
	io.grpc#grpc-googleapis;1.53.0 from central in [default]
	io.grpc#grpc-grpclb;1.53.0 from central in [default]
	io.grpc#grpc-netty-shaded;1.53.0 from central in [default]
	io.grpc#grpc-protobuf;1.53.0 from central in [default]
	io.grpc#grpc-protobuf-lite;1.53.0 from central in [default]
	io.grpc#grpc-services;1.53.0 from central in [default]
	io.grpc#grpc-stub;1.53.0 from central in [default]
	io.grpc#grpc-xds;1.53.0 from central in [default]
	io.opencensus#opencensus-api;0.31.1 from central in [default]
	io.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]
	io.opencensus#opencensus-proto;0.2.0 from central in [default]
	io.perfmark#perfmark-api;0.26.0 from central in [default]
	it.unimi.dsi#fastutil;7.0.12 from central in [default]
	jakarta.activation#jakarta.activation-api;2.1.3 from central in [default]
	jakarta.mail#jakarta.mail-api;2.1.3 from central in [default]
	javax.annotation#javax.annotation-api;1.3.2 from central in [default]
	joda-time#joda-time;2.8.1 from central in [default]
	org.apache.commons#commons-collections4;4.4 from central in [default]
	org.apache.commons#commons-compress;1.19 from central in [default]
	org.apache.commons#commons-math3;3.6.1 from central in [default]
	org.apache.httpcomponents#httpclient;4.5.13 from central in [default]
	org.apache.httpcomponents#httpcore;4.4.13 from central in [default]
	org.apache.pdfbox#fontbox;2.0.28 from central in [default]
	org.apache.pdfbox#pdfbox;2.0.28 from central in [default]
	org.apache.pdfbox#xmpbox;2.0.16 from central in [default]
	org.apache.poi#poi;4.1.2 from central in [default]
	org.apache.poi#poi-ooxml;4.1.2 from central in [default]
	org.apache.poi#poi-ooxml-schemas;4.1.2 from central in [default]
	org.apache.poi#poi-scratchpad;4.1.2 from central in [default]
	org.apache.xmlbeans#xmlbeans;3.1.0 from central in [default]
	org.checkerframework#checker-qual;3.31.0 from central in [default]
	org.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]
	org.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]
	org.eclipse.angus#angus-activation;2.0.2 from central in [default]
	org.eclipse.angus#angus-mail;2.0.3 from central in [default]
	org.jetbrains#annotations;24.1.0 from central in [default]
	org.jsoup#jsoup;1.18.2 from central in [default]
	org.nibor.autolink#autolink;0.6.0 from central in [default]
	org.projectlombok#lombok;1.16.8 from central in [default]
	org.rocksdb#rocksdbjni;6.29.5 from central in [default]
	org.threeten#threetenbp;1.6.5 from central in [default]
	software.amazon.ion#ion-java;1.0.2 from central in [default]
	:: evicted modules:
	commons-logging#commons-logging;1.2 by [commons-logging#commons-logging;1.1.3] in [default]
	commons-codec#commons-codec;1.11 by [commons-codec#commons-codec;1.15] in [default]
	com.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default]
	com.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] in [default]
	com.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10.1] in [default]
	commons-codec#commons-codec;1.13 by [commons-codec#commons-codec;1.15] in [default]
	org.jetbrains#annotations;15.0 by [org.jetbrains#annotations;24.1.0] in [default]
	org.jsoup#jsoup;1.11.3 by [org.jsoup#jsoup;1.18.2] in [default]
	org.apache.pdfbox#pdfbox;2.0.16 by [org.apache.pdfbox#pdfbox;2.0.28] in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  163  |   0   |   0   |   9   ||  154  |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-73000933-a013-4409-92c5-17b3ebf12849
	confs: [default]
	0 artifacts copied, 154 already retrieved (0kB/13ms)
25/11/19 07:33:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

explain_document_ml download started this may take some time.
Approx size to download 9 MB
[ | ]

25/11/19 07:33:45 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/11/19 07:33:45 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

explain_document_ml download started this may take some time.
Approximate size to download 9 MB
Download done! Loading the resource.
[OK!]
{'document': ['We are very happy about SparkNLP'], 'spell': ['We', 'are', 'very', 'happy', 'about', 'SparkNLP'], 'pos': ['PRP', 'VBP', 'RB', 'JJ', 'IN', 'NNP'], 'lemmas': ['We', 'be', 'very', 'happy', 'about', 'SparkNLP'], 'token': ['We', 'are', 'very', 'happy', 'about', 'SparkNLP'], 'stems': ['we', 'ar', 'veri', 'happi', 'about', 'sparknlp'], 'sentence': ['We are very happy about SparkNLP']}

# fullAnnotate returns a list whose elements are dicts; the dict keys are the
# annotation fields that are available.
full = explain_document_pipeline.fullAnnotate("We are very happy about SparkNLP")
first_result = full[0]
print("fullAnnotate keys:", list(first_result.keys()))
print("完整结构示例（token、pos、dependency 等）:")
# .get() is used so that a field missing from the result prints as None
# instead of raising KeyError.
for field in ("document", "token", "pos", "lemmas", "stems", "dependency"):
    print(field, "->", first_result.get(field))

fullAnnotate keys: ['document', 'spell', 'pos', 'lemmas', 'token', 'stems', 'sentence']
完整结构示例（token、pos、dependency 等）:
document -> [Annotation(document, 0, 31, We are very happy about SparkNLP, {}, [])]
token -> [Annotation(token, 0, 1, We, {'sentence': '0'}, []), Annotation(token, 3, 5, are, {'sentence': '0'}, []), Annotation(token, 7, 10, very, {'sentence': '0'}, []), Annotation(token, 12, 16, happy, {'sentence': '0'}, []), Annotation(token, 18, 22, about, {'sentence': '0'}, []), Annotation(token, 24, 31, SparkNLP, {'sentence': '0'}, [])]
pos -> [Annotation(pos, 0, 1, PRP, {'word': 'We', 'sentence': '0'}, []), Annotation(pos, 3, 5, VBP, {'word': 'are', 'sentence': '0'}, []), Annotation(pos, 7, 10, RB, {'word': 'very', 'sentence': '0'}, []), Annotation(pos, 12, 16, JJ, {'word': 'happy', 'sentence': '0'}, []), Annotation(pos, 18, 22, IN, {'word': 'about', 'sentence': '0'}, []), Annotation(pos, 24, 31, NNP, {'word': 'SparkNLP', 'sentence': '0'}, [])]
lemmas -> [Annotation(token, 0, 1, We, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 3, 5, be, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 7, 10, very, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 12, 16, happy, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 18, 22, about, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 24, 31, SparkNLP, {'confidence': '0.0', 'sentence': '0'}, [])]
stems -> [Annotation(token, 0, 1, we, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 3, 5, ar, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 7, 10, veri, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 12, 16, happi, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 18, 22, about, {'confidence': '1.0', 'sentence': '0'}, []), Annotation(token, 24, 31, sparknlp, {'confidence': '0.0', 'sentence': '0'}, [])]
dependency -> None

import sparknlp
# Imported in this cell so it also runs on its own in a fresh session;
# PretrainedPipeline is used below.
from sparknlp.pretrained import PretrainedPipeline

# Start the Spark session through Spark NLP (an already running session is
# reused, in which case Spark NLP prints a warning).
spark = sparknlp.start()

sentences = [
  ['Hello, this is an example sentence'],
  ['And this is a second sentence.']
]

# Build a DataFrame with one row per sentence and a single column named "text".
data = spark.createDataFrame(sentences).toDF("text")

# Download the pretrained pipeline from John Snow Labs' servers
explain_document_pipeline = PretrainedPipeline("explain_document_ml")

# Apply the pipeline to the DataFrame and preview the annotated columns.
annotations_df = explain_document_pipeline.transform(data)
annotations_df.show()

Warning::Spark Session already created, some configs may not take.
explain_document_ml download started this may take some time.

25/11/13 11:19:43 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

Approx size to download 9 MB
[OK!]

25/11/13 11:19:47 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
25/11/13 11:19:48 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
25/11/13 11:19:49 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|               spell|              lemmas|               stems|                 pos|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Hello, this is an...|[{document, 0, 33...|[{document, 0, 33...|[{token, 0, 4, He...|[{token, 0, 4, He...|[{token, 0, 4, He...|[{token, 0, 4, he...|[{pos, 0, 4, UH, ...|
|And this is a sec...|[{document, 0, 29...|[{document, 0, 29...|[{token, 0, 2, An...|[{token, 0, 2, An...|[{token, 0, 2, An...|[{token, 0, 2, an...|[{pos, 0, 2, CC, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+

25/11/13 11:19:51 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB

# Display only the "token" column, without truncating the cell contents.
token_df = annotations_df.select("token")
token_df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|token                                                                                                                                                                                                                                                                                                           |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{token, 0, 4, Hello, {sentence -> 0}, []}, {token, 5, 5, ,, {sentence -> 0}, []}, {token, 7, 10, this, {sentence -> 0}, []}, {token, 12, 13, is, {sentence -> 0}, []}, {token, 15, 16, an, {sentence -> 0}, []}, {token, 18, 24, example, {sentence -> 0}, []}, {token, 26, 33, sentence, {sentence -> 0}, []}]|
|[{token, 0, 2, And, {sentence -> 0}, []}, {token, 4, 7, this, {sentence -> 0}, []}, {token, 9, 10, is, {sentence -> 0}, []}, {token, 12, 12, a, {sentence -> 0}, []}, {token, 14, 19, second, {sentence -> 0}, []}, {token, 21, 28, sentence, {sentence -> 0}, []}, {token, 29, 29, ., {sentence -> 0}, []}]    |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

from sparknlp import Finisher
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Finisher configured to read the "token", "lemmas" and "pos" annotation columns.
finisher = Finisher()
finisher.setInputCols(["token", "lemmas", "pos"])

# Reuse the model of the pretrained pipeline as the first stage.
explain_pipeline_model = PretrainedPipeline("explain_document_ml").model

pipeline = Pipeline().setStages([explain_pipeline_model, finisher])

# Two single-column rows that become the "text" column of the input DataFrame.
sentences = [
    ['Hello, this is an example sentence'],
    ['And this is a second sentence.'],
]
data = spark.createDataFrame(sentences).toDF("text")

model = pipeline.fit(data)
annotations_finished_df = model.transform(data)

# Show the finished token column in full width.
finished_tokens = annotations_finished_df.select('finished_token')
finished_tokens.show(truncate=False)

explain_document_ml download started this may take some time.

25/11/13 11:20:36 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

Approx size to download 9 MB
[OK!]
+-------------------------------------------+
|finished_token                             |
+-------------------------------------------+
|[Hello, ,, this, is, an, example, sentence]|
|[And, this, is, a, second, sentence, .]    |
+-------------------------------------------+

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Custom pipeline: raw text -> document -> sentence -> token -> finished output.
# Each annotator is defined exactly once, and the sentence detector's output
# column ("sentence") matches the tokenizer's input column.

# Turn the raw "text" column into a "document" annotation column.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Split each document into sentences.
sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

# NOTE: despite the variable name, this is the standard Tokenizer annotator.
regexTokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

# Convert the token annotations to plain values and keep their metadata,
# which yields the finished_token and finished_token_metadata columns.
finisher = Finisher() \
    .setInputCols(["token"]) \
    .setIncludeMetadata(True)

pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        sentenceDetector,
        regexTokenizer,
        finisher
    ])

# Single-row DataFrame with a "text" column, used for both fit and transform.
data = spark.createDataFrame([("hello, this is an example sentence",)], ["text"])
model = pipeline.fit(data)
annotations = model.transform(data)
annotations.show(truncate=False)

+----------------------------------+-------------------------------------------+---------------------------------------------------------------------------------------------------------+
|text                              |finished_token                             |finished_token_metadata                                                                                  |
+----------------------------------+-------------------------------------------+---------------------------------------------------------------------------------------------------------+
|hello, this is an example sentence|[hello, ,, this, is, an, example, sentence]|[{sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}, {sentence, 0}]|
+----------------------------------+-------------------------------------------+---------------------------------------------------------------------------------------------------------+

from sparknlp.base import LightPipeline
# Load the pretrained pipeline and take its underlying model.
explain_document_pipeline = PretrainedPipeline("explain_document_ml")
pretrained_model = explain_document_pipeline.model

# Wrap the model in a LightPipeline so a plain string can be annotated directly.
lightPipeline = LightPipeline(pretrained_model)

lightPipeline.annotate("Hello world, please annotate my text")

explain_document_ml download started this may take some time.

25/11/13 11:27:18 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

Approx size to download 9 MB
[OK!]

{'document': ['Hello world, please annotate my text'],
 'spell': ['Hello', 'world', ',', 'please', 'annotate', 'my', 'text'],
 'pos': ['UH', 'NN', ',', 'VB', 'NN', 'PRP$', 'NN'],
 'lemmas': ['Hello', 'world', ',', 'please', 'annotate', 'i', 'text'],
 'token': ['Hello', 'world', ',', 'please', 'annotate', 'my', 'text'],
 'stems': ['hello', 'world', ',', 'pleas', 'annot', 'my', 'text'],
 'sentence': ['Hello world, please annotate my text']}

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline
# Template only: `[...]` stands in for the list of pipeline stages.
pipeline = Pipeline().setStages([...])

from sparknlp.base import LightPipeline
# Template only: someTrainedPipeline and someStringOrArray are not defined in
# this snippet. Presumably they are placeholders for a fitted pipeline and the
# text(s) to annotate; substitute real values before running.
LightPipeline(someTrainedPipeline).annotate(someStringOrArray)

from sparknlp.training import POS
# Read a "|"-delimited POS training corpus; the tag annotations go into the
# "tags" column.
pos = POS()
path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
posDf = pos.readDataset(spark, path, "|", "tags")

# Explode the tags array into one row per annotation and preview three rows.
exploded_tags = posDf.selectExpr("explode(tags) as tags")
exploded_tags.show(3, truncate=False)

+---------------------------------------+
|tags                                   |
+---------------------------------------+
|{pos, 0, 5, NNP, {word -> Pierre}, []} |
|{pos, 7, 12, NNP, {word -> Vinken}, []}|
|{pos, 14, 14, ,, {word -> ,}, []}      |
+---------------------------------------+
only showing top 3 rows

from sparknlp.training import CoNLL
# Load the CoNLL-2003 training file into a DataFrame.
conll_reader = CoNLL()
trainingData = conll_reader.readDataset(spark, "src/test/resources/conll2003/eng.train")

# Keep the raw text and only the `result` field of each annotation column.
preview_columns = [
    "text",
    "token.result as tokens",
    "pos.result as pos",
    "label.result as label",
]
trainingData.selectExpr(*preview_columns).show(3, False)

+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
only showing top 3 rows

25/11/13 11:39:42 WARN TaskSetManager: Stage 39 contains a task of very large size (4261 KiB). The maximum recommended task size is 1000 KiB.

from sparknlp.training import CoNLLU
# Load a CoNLL-U file with explodeSentences=False.
conlluFile = "src/test/resources/conllu/en.test.conllu"
conllu_reader = CoNLLU(explodeSentences=False)
conllDataSet = conllu_reader.readDataset(spark, conlluFile)

# Keep the raw text and only the `result` field of each annotation column.
preview_columns = [
    "text",
    "form.result as form",
    "upos.result as upos",
    "xpos.result as xpos",
    "lemma.result as lemma",
]
conllDataSet.selectExpr(*preview_columns).show(1, False)

+-----------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|text                                     |form                                          |upos                                         |xpos                          |lemma                                       |
+-----------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|What if Google Morphed Into GoogleOS?\n\n|[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
+-----------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
only showing top 1 row

from sparknlp.training import PubTator
# Load the PubTator sample corpus into a DataFrame and preview the first row.
pubTatorFile = "./src/test/resources/corpus_pubtator_sample.txt"
pubtator_reader = PubTator()
pubTatorDataSet = pubtator_reader.readDataset(spark, pubTatorFile)
pubTatorDataSet.show(1)

25/11/13 11:41:25 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/11/13 11:41:25 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

pos_anc download started this may take some time.
Approximate size to download 3.9 MB
Download done! Loading the resource.
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
|  doc_id|      finished_token|        finished_pos|        finished_ner|finished_token_metadata|finished_pos_metadata|finished_label_metadata|
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
|25763772|[DCTN4, as, a, mo...|[NNP, IN, DT, NN,...|[B-T116, O, O, O,...|   [{sentence, 0}, {...| [{word, DCTN4}, {...|   [{word, DCTN4}, {...|
+--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+

# Dependency parser training example.
# This cell demonstrates how to train an unlabeled dependency parser with Spark NLP.
# It relies on a dependency treebank in Penn Treebank format (dependency_treebank).
# Stages: text input, sentence detection, tokenization, POS tagging, dependency parser.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# DocumentAssembler converts the raw text into Spark NLP document annotations,
# which are the input of the following stages; it is the first stage of the
# pipeline.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Sentence detection on the document column.
sentence = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)

# Tokenization of each detected sentence.
tokenizer = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

# Pretrained part-of-speech tagger (default pretrained model).
posTagger = (
    PerceptronModel.pretrained()
    .setInputCols("sentence", "token")
    .setOutputCol("pos")
)

# Trainable dependency parser that reads its training trees from the treebank path.
dependencyParserApproach = (
    DependencyParserApproach()
    .setInputCols("sentence", "pos", "token")
    .setOutputCol("dependency")
    .setDependencyTreeBank("src/test/resources/parser/unlabeled/dependency_treebank")
)

# Build the dependency parsing pipeline and run inference.
stages = [
    documentAssembler,
    sentence,
    tokenizer,
    posTagger,
    dependencyParserApproach,
]
pipeline = Pipeline().setStages(stages)

# Additional training data is not needed, the dependency parser relies on the dependency tree bank / CoNLL-U only.
emptyDataSet = spark.createDataFrame([[""]]).toDF("text")
pipelineModel = pipeline.fit(emptyDataSet)

# Apply the fitted pipeline to one example sentence and show its tokens.
data = spark.createDataFrame([("Hello, this is an example sentence.",)], ["text"])
parsed = pipelineModel.transform(data)
parsed.selectExpr("token.result as tokens").show(truncate=False)

pos_anc download started this may take some time.

25/11/13 11:49:02 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

Approximate size to download 3.9 MB
[OK!]
+----------------------------------------------+
|tokens                                        |
+----------------------------------------------+
|[Hello, ,, this, is, an, example, sentence, .]|
+----------------------------------------------+

# Training example: dictionary-based Lemmatizer.
# The lemma dictionary (lemmas_small.txt) is passed to setDictionary() with
# "->" as the key delimiter and a tab as the value delimiter, i.e. lines of
# the form: key -> value1 value2 ...
# Stages:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. SentenceDetector: "document" -> "sentence".
# 3. Tokenizer: "sentence" -> "token".
# 4. Lemmatizer: "token" -> "lemma", using the custom dictionary.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\t")
)

# Assemble the lemmatization pipeline.
pipeline = Pipeline().setStages(
    [documentAssembler, sentenceDetector, tokenizer, lemmatizer]
)

data = (
    spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]])
    .toDF("text")
)

# Fit and apply on the same one-row DataFrame, then show the lemmas.
result = pipeline.fit(data).transform(data)
result.selectExpr("lemma.result").show(truncate=False)

+------------------------------------------------------------------+
|result                                                            |
+------------------------------------------------------------------+
|[Peter, Pipers, employees, are, pick, peck, of, pickle, pepper, .]|
+------------------------------------------------------------------+

# Training example: perceptron POS tagger (PerceptronApproach).
# Steps:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. SentenceDetector: "document" -> "sentence".
# 3. Tokenizer: "sentence" -> "token".
# 4. POS().readDataset: load the POS training corpus as a DataFrame.
# 5. PerceptronApproach: train the tagger (trainedPos) from that DataFrame,
#    reading the gold tags from the "tags" column.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

datasetPath = "src/test/resources/anc-pos-corpus-small/test-training.txt"
trainingPerceptronDF = POS().readDataset(spark, datasetPath)

# The tagger is fitted right here, so `trainedPos` is already a trained model
# when it is placed in the pipeline below.
trainedPos = (
    PerceptronApproach()
    .setInputCols(["document", "token"])
    .setOutputCol("pos")
    .setPosColumn("tags")
    .fit(trainingPerceptronDF)
)

# Inference pipeline: document assembly, sentence detection, tokenization and
# the trained perceptron tagger.
#
# About the output of result.selectExpr("pos.result").show(truncate=False):
# it lists one POS tag per token of the input text (e.g. 'NN' for a noun,
# 'VB' for a verb).
pipeline = Pipeline().setStages(
    [documentAssembler, sentence, tokenizer, trainedPos]
)

data = spark.createDataFrame([["To be or not to be, is this the question?"]]).toDF("text")
result = pipeline.fit(data).transform(data)

result.selectExpr("pos.result").show(truncate=False)

+--------------------------------------------------+
|result                                            |
+--------------------------------------------------+
|[NNP, NNP, CD, JJ, NNP, NNP, ,, MD, VB, DT, CD, .]|
+--------------------------------------------------+

# Training example: deep-learning sentence detector (SentenceDetectorDLApproach).
# Steps:
# 1. Read the training text; per the original notes each line is one sentence.
# 2. DocumentAssembler: raw text -> "document" annotations.
# 3. SentenceDetectorDLApproach: configure input/output columns and the
#    number of training epochs.
# 4. Build the pipeline and fit the model.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

trainingData = spark.read.text("train.txt").toDF("text")

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetectorDLApproach()
    .setInputCols(["document"])
    .setOutputCol("sentences")
    .setEpochsNumber(100)
)

pipeline = Pipeline().setStages(
    [
        documentAssembler,
        sentenceDetector,
    ]
)

model = pipeline.fit(trainingData)

# Training example: labeled dependency parser (TypedDependencyParserApproach).
# Steps:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. SentenceDetector: "document" -> "sentence".
# 3. Tokenizer: "sentence" -> "token".
# 4. PerceptronModel.pretrained: pretrained POS tagger -> "pos".
# 5. DependencyParserModel.pretrained: pretrained unlabeled parser -> "dependency".
# 6. TypedDependencyParserApproach: trained from a CoNLL-U file; it labels the
#    relation between words (e.g. subject, modifier, object).

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

posTagger = (
    PerceptronModel.pretrained()
    .setInputCols(["sentence", "token"])
    .setOutputCol("pos")
)

dependencyParser = (
    DependencyParserModel.pretrained()
    .setInputCols(["sentence", "pos", "token"])
    .setOutputCol("dependency")
)

typedDependencyParser = (
    TypedDependencyParserApproach()
    .setInputCols(["dependency", "pos", "token"])
    .setOutputCol("dependency_type")
    .setConllU("src/test/resources/parser/labeled/train_small.conllu.txt")
    .setNumberOfIterations(1)
)

# Full pipeline: document assembly, sentence detection, tokenization, POS
# tagging, unlabeled parsing and finally the trainable labeled parser.
pipeline = Pipeline().setStages(
    [
        documentAssembler,
        sentence,
        tokenizer,
        posTagger,
        dependencyParser,
        typedDependencyParser,
    ]
)

# Additional training data is not needed, the dependency parser relies on
# CoNLL-U only, so the pipeline is fitted on a one-row empty-text DataFrame.
emptyDataSet = spark.createDataFrame([[""]]).toDF("text")
pipelineModel = pipeline.fit(emptyDataSet)

# Apply the fitted pipeline to one sentence and show the dependency labels.
data = spark.createDataFrame([("Hello, this is an example sentence.",)], ["text"])
(
    pipelineModel.transform(data)
    .selectExpr("dependency_type.result as labeled_dependencies")
    .show(truncate=False)
)

pos_anc download started this may take some time.

25/11/13 11:51:42 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

Approximate size to download 3.9 MB
[OK!]
dependency_conllu download started this may take some time.

25/11/13 11:51:46 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

Approximate size to download 16.7 MB
[ | ]

25/11/13 11:51:47 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/11/13 11:51:47 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.

dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[ | ]Download done! Loading the resource.

[Stage 70:===========================================>              (3 + 1) / 4]

[ — ]

# Training example: Chinese word segmenter (WordSegmenterApproach).
# Steps:
# 1. Import the Spark NLP modules.
# 2. DocumentAssembler: raw text -> "document" annotations.
# 3. WordSegmenterApproach: configure input/output columns, the column that
#    holds the training tags ("tags") and the number of iterations.
# 4. Load the training corpus with POS().readDataset and fit the pipeline.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

wordSegmenter = (
    WordSegmenterApproach()
    .setInputCols(["document"])
    .setOutputCol("token")
    .setPosColumn("tags")
    .setNIterations(5)
)

pipeline = Pipeline().setStages([documentAssembler, wordSegmenter])

trainingDataSet = POS().readDataset(
    spark, "src/test/resources/word-segmenter/chinese_train.utf8"
)

pipelineModel = pipeline.fit(trainingDataSet)

# Training example: context-aware spell checker (ContextSpellCheckerApproach).
# Steps:
# 1. Import the Spark NLP modules.
# 2. DocumentAssembler: raw text -> "document" annotations.
# 3. Tokenizer: "document" -> "token".
# 4. ContextSpellCheckerApproach: configure input/output columns, maximum
#    word distance, batch size, number of epochs and the number of language
#    model classes.
# 5. Assemble the pipeline and fit it on a plain-text corpus.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

spellChecker = (
    ContextSpellCheckerApproach()
    .setInputCols("token")
    .setOutputCol("corrected")
    .setWordMaxDistance(3)
    .setBatchSize(24)
    .setEpochs(8)
    .setLanguageModelClasses(1650)  # dependent on vocabulary size
    # .addVocabClass("_NAME_", names)  # Extra classes for correction could be added like this
)

# Pipeline for the ContextSpellCheckerApproach configured above
# (document assembler -> tokenizer -> context spell checker).
pipeline = Pipeline().setStages(
    [
        documentAssembler,
        tokenizer,
        spellChecker,
    ]
)

# Train on a plain-text file loaded into a single "text" column.
path = "sherlockholmes.txt"
dataset = spark.read.text(path).toDF("text")
pipelineModel = pipeline.fit(dataset)

# Training example: NorvigSweeting spell checker (NorvigSweetingApproach).
# Steps:
# 1. Import the Spark NLP modules.
# 2. DocumentAssembler: raw text -> "document" annotations.
# 3. Tokenizer: "document" -> "token".
# 4. NorvigSweetingApproach: configure input/output columns and the
#    dictionary file of words.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

spellChecker = (
    NorvigSweetingApproach()
    .setInputCols(["token"])
    .setOutputCol("spell")
    .setDictionary("src/test/resources/spell/words.txt")
)

pipeline = Pipeline().setStages(
    [
        documentAssembler,
        tokenizer,
        spellChecker,
    ]
)

# NOTE(review): `trainingData` is not defined in this cell; presumably it is
# the DataFrame left over from an earlier cell - confirm before running.
pipelineModel = pipeline.fit(trainingData)

25/11/13 12:19:30 WARN TaskSetManager: Stage 130 contains a task of very large size (4261 KiB). The maximum recommended task size is 1000 KiB.

# Training example: SymmetricDelete spell checker (SymmetricDeleteApproach).
# Steps:
# 1. Import the Spark NLP modules.
# 2. DocumentAssembler: raw text -> "document" annotations.
# 3. Tokenizer: "document" -> "token".
# 4. SymmetricDeleteApproach: configure input/output columns and the
#    dictionary file of words.
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

spellChecker = (
    SymmetricDeleteApproach()
    .setInputCols(["token"])
    .setOutputCol("spell")
    .setDictionary("src/test/resources/spell/words.txt")
)

pipeline = Pipeline().setStages(
    [
        documentAssembler,
        tokenizer,
        spellChecker,
    ]
)

# NOTE(review): `trainingData` is not defined in this cell; presumably it is
# the DataFrame left over from an earlier cell - confirm before running.
pipelineModel = pipeline.fit(trainingData)

25/11/13 12:19:48 WARN TaskSetManager: Stage 134 contains a task of very large size (4261 KiB). The maximum recommended task size is 1000 KiB.
25/11/13 12:19:50 WARN TaskSetManager: Stage 136 contains a task of very large size (4261 KiB). The maximum recommended task size is 1000 KiB.
25/11/13 12:21:04 WARN TaskSetManager: Stage 142 contains a task of very large size (4261 KiB). The maximum recommended task size is 1000 KiB.

# Training example: CRF named entity recognizer (NerCrfApproach).
# Annotators defined first:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. SentenceDetector: "document" -> "sentence".
# 3. Tokenizer: "sentence" -> "token".
# 4. PerceptronModel.pretrained: pretrained POS tagger -> "pos".

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from pyspark.ml import Pipeline

# The four annotators below are only needed when the training data has to be
# built from raw text. They are not part of the pipeline fitted in this cell,
# because the CoNLL dataset used here already carries the sentence, token,
# pos and label columns.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

posTagger = (
    PerceptronModel.pretrained()
    .setInputCols(["sentence", "token"])
    .setOutputCol("pos")
)

# Training itself:
# 1. WordEmbeddingsModel: pretrained word vectors -> "embeddings".
# 2. NerCrfApproach: consumes sentence, token, pos and embeddings; reads the
#    gold labels from "label"; trains between 1 and 3 epochs; writes "ner".
# 3. Pipeline: embeddings followed by the NER tagger.
# 4. CoNLL().readDataset: loads the CoNLL 2003 training file.
# 5. pipeline.fit(trainingData): trains the NER model.

embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

nerTagger = (
    NerCrfApproach()
    .setInputCols(["sentence", "token", "pos", "embeddings"])
    .setLabelColumn("label")
    .setMinEpochs(1)
    .setMaxEpochs(3)
    .setOutputCol("ner")
)

pipeline = Pipeline().setStages([embeddings, nerTagger])

# We use the sentences, tokens, POS tags and labels from the CoNLL dataset.
conll = CoNLL()
trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
pipelineModel = pipeline.fit(trainingData)

# Training example: deep-learning named entity recognizer (NerDLApproach).
# Annotators involved:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. SentenceDetector: "document" -> "sentence".
# 3. Tokenizer: "sentence" -> "token".
# 4. Embeddings (BertEmbeddings here): tokens -> "embeddings".
# 5. NerDLApproach: consumes sentence, token and embeddings; reads the gold
#    labels from "label"; writes "ner".
# 6. Pipeline: embeddings followed by the NER tagger.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from pyspark.ml import Pipeline

# This CoNLL dataset already includes a sentence, token and label
# column with their respective annotator types. If a custom dataset is used,
# these need to be defined with for example:
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

embeddings = (
    BertEmbeddings.pretrained()
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setRandomSeed(0)
    .setVerbose(0)
)

pipeline = Pipeline().setStages([embeddings, nerTagger])

# We use the sentences, tokens, and labels from the CoNLL dataset.
conll = CoNLL()
trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
pipelineModel = pipeline.fit(trainingData)

# Training example: deep-learning text classifier (ClassifierDLApproach).
# Steps:
# 1. Read the training CSV (with header) that holds the text and its label.
# 2. DocumentAssembler: raw text -> "document" annotations.
# 3. UniversalSentenceEncoder: "document" -> "sentence_embeddings".
# 4. ClassifierDLApproach: configure input/output columns, label column,
#    batch size, max epochs, learning rate and dropout.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

smallCorpus = (
    spark.read
    .option("header","True")
    .csv("src/test/resources/classifier/sentiment.csv")
)

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

useEmbeddings = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

docClassifier = (
    ClassifierDLApproach()
    .setInputCols("sentence_embeddings")
    .setOutputCol("category")
    .setLabelColumn("label")
    .setBatchSize(64)
    .setMaxEpochs(20)
    .setLr(5e-3)
    .setDropout(0.5)
)

pipeline = Pipeline().setStages(
    [documentAssembler, useEmbeddings, docClassifier]
)

pipelineModel = pipeline.fit(smallCorpus)

# Training example: Vivekn sentiment analysis (ViveknSentimentApproach).
# Stages:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. Tokenizer: "document" -> "token".
# 3. Normalizer: "token" -> "normal" (default settings are used here).
# 4. ViveknSentimentApproach: consumes "document" and "normal", reads the
#    training label from "train_sentiment", writes "result_sentiment".
# 5. Finisher: turns "result_sentiment" into the plain "final_sentiment" column.
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

token = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

vivekn = (
    ViveknSentimentApproach()
    .setInputCols(["document", "normal"])
    .setSentimentCol("train_sentiment")
    .setOutputCol("result_sentiment")
)

finisher = (
    Finisher()
    .setInputCols(["result_sentiment"])
    .setOutputCols("final_sentiment")
)

# Build, train and apply the sentiment pipeline:
# 1. Assemble the stages defined above.
# 2. Create a small labeled training set (positive / negative).
# 3. Train with Pipeline.fit().
# 4. Create unlabeled test data and predict its sentiment.
# 5. Show the final sentiment labels.
pipeline = Pipeline().setStages(
    [
        document,
        token,
        normalizer,
        vivekn,
        finisher,
    ]
)

training = spark.createDataFrame(
    [
        ("I really liked this movie!", "positive"),
        ("The cast was horrible", "negative"),
        ("Never going to watch this again or recommend it to anyone", "negative"),
        ("It's a waste of time", "negative"),
        ("I loved the protagonist", "positive"),
        ("The music was really really good", "positive"),
    ]
).toDF("text", "train_sentiment")
pipelineModel = pipeline.fit(training)

data = spark.createDataFrame(
    [
        ["I recommend this movie"],
        ["Dont waste your time!!!"],
    ]
).toDF("text")
result = pipelineModel.transform(data)

result.select("final_sentiment").show(truncate=False)

+---------------+
|final_sentiment|
+---------------+
|[positive]     |
|[negative]     |
+---------------+

# Training example: document embeddings (Doc2VecApproach).
# Stages:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. Tokenizer: "document" -> "token".
# 3. Doc2VecApproach: trained on the tokens, writes the "embeddings" column.

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

embeddings = (
    Doc2VecApproach()
    .setInputCols(["token"])
    .setOutputCol("embeddings")
)

pipeline = Pipeline().setStages([documentAssembler, tokenizer, embeddings])

# Train on a plain-text file loaded into a single "text" column.
path = "sherlockholmes.txt"
dataset = (
    spark.read.text(path)
    .toDF("text")
)
pipelineModel = pipeline.fit(dataset)

# Training example: word embeddings (Word2VecApproach).
# Stages:
# 1. DocumentAssembler: raw text -> "document" annotations.
# 2. Tokenizer: "document" -> "token".
# 3. Word2VecApproach: trained on the tokens, writes the "embeddings" column.
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

embeddings = (
    Word2VecApproach()
    .setInputCols(["token"])
    .setOutputCol("embeddings")
)

pipeline = Pipeline().setStages([documentAssembler, tokenizer, embeddings])

# Train on a plain-text file loaded into a single "text" column.
path = "sherlockholmes.txt"
dataset = (
    spark.read.text(path)
    .toDF("text")
)
pipelineModel = pipeline.fit(dataset)

大数据分析与挖掘¶

09. Spark NLP Models Hub¶

Spark NLP Models Hub介绍¶

安装Spark NLP¶

从 Python 启动 Spark NLP 会话¶

概念¶

Annotations（注释数据结构）¶

Annotators（注释器）¶

快速标注文本¶

Explain Document ML¶

下载和使用预训练管线¶

使用预训练管线处理 Spark DataFrame¶

操作管道¶

搭建自定义管道¶

必要导入¶

DocumentAssembler：数据输入¶

句子检测与分词¶

Finisher：输出注释结果¶

使用 Spark ML Pipeline¶

使用 Spark NLP 的 LightPipeline¶

训练注释器¶

训练方法论¶

Spark NLP 导入说明¶

Spark ML Pipelines¶

LightPipeline¶

Spark NLP - 训练¶

训练数据集¶

POS 数据集¶

CoNLL 数据集¶

CoNLL-U 数据集¶

PubTator 数据集¶

文本处理¶

DependencyParserApproach（依存句法分析器）¶

Lemmatizer（词形还原器）¶

PerceptronApproach（感知机词性标注器）¶

SentenceDetectorDLApproach（深度学习句子检测器）¶

TypedDependencyParser（有标签依存句法分析器）¶

WordSegmenterApproach（分词器）¶

拼写检查器¶

ContextSpellCheckerApproach¶

NorvigSweeting 拼写检查器¶

SymmetricDelete 拼写检查器¶

Token Classification（标注分类）¶

NerCrfApproach¶

NerDLApproach¶

文本分类¶

ClassifierDLApproach¶

ViveknSentimentApproach¶

文本表示（Text Representation）¶

Doc2VecApproach¶

应用场景¶

Word2VecApproach¶