-- jsoup download: <https://jsoup.org/download> (e.g. jsoup-1.15.3.jar, released August 24th, 2022)
-- copy the jar into $HOME/BSF4ooRexx/lib and you are ready to go ... ;)

-- <https://www.tutorialspoint.com/jsoup/jsoup_extract_text.htm>
/*

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupTester {
   public static void main(String[] args) {

      String html = "<html><head><title>Sample Title</title></head>"
         + "<body>"
         + "<p>Sample Content</p>"
         + "<div id='sampleDiv'><a href='www.google.com'>Google</a>"
         + "<h3><a>Sample</a><h3>"
         +"</div>"
         +"</body></html>";
      Document document = Jsoup.parse(html);

      //a with href
      Element link = document.select("a").first();

      System.out.println("Text: " + link.text());
   }
}
*/

-- Parse the HTML resource defined at the end of this file with jsoup,
-- inspect the first <a> element, then switch the document's output
-- settings and show the serialized document before and after.

jsoup      = bsf.importClass("org.jsoup.Jsoup")   -- make the Jsoup class available
htmlSource = .resources~html~makeString           -- the "html" resource as one string
document   = jsoup~parse(htmlSource)              -- parse the markup
anchors    = document~select("a")                 -- select the "a" elements
link       = anchors~first                        -- work with the first one

-- query the attribute with the spelling used in the markup ...
say "link~hasAttr('href'):" link~hasAttr("href")
say "link~attr('href')   :" link~attr("href")
-- ... and once more with a mixed-case spelling
say "link~hasAttr('HrEf'):" link~hasAttr("HrEf")
say "link~attr('HrEf')   :" link~attr("HrEf")
-- the text content of the link element
say "link~text           :" link~text

say "---"
htmlBefore = document~html
say "document~html:" pp(htmlBefore)

say "--- setting to xml ..."
entities = bsf.loadClass("org.jsoup.nodes.Entities")
settings = document~outputSettings

-- Java: document.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml);
xhtmlMode = entities~EscapeMode~xhtml
settings~escapeMode(xhtmlMode)

-- Java: document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
xmlSyntax = document~OutputSettings~Syntax~xml
settings~syntax(xmlSyntax)

htmlAfter = document~html
say "document~html (xhtml?):" pp(htmlAfter)

say "---"
-- round trip a sample string through Entities.escape() and Entities.unescape()
text = "ber den W&ouml;lkchen muss di..."
esc  = entities~escape(text)
say "text          :" pp(text)
say "esc           :" pp(esc)
unesc = entities~unescape(esc)
say "unescaped(esc):" pp(unesc)


::requires "BSF.CLS"    -- get ooRexx-Java bridge

::resource html
   <html><head><title>A Very Important Title
   <body>
      <p>A paragraph with a closing tag </p>

      <p>Some paragraph with a non-breaking space entity &nbsp;)
         and NO closing tag

      <!-- an empty element: -->

      <br/>
      <div id='someId'><a href='https://www.RexxLA.org'>Rexx Language Association</a>
      <h3><a>Sample
      </div>
   </body></html>
::END
