Implement long polling to allow web crawling of websites

This commit is contained in:
David Botton 2022-04-11 23:41:51 -04:00
parent 0134558c03
commit ad65de9f18
5 changed files with 175 additions and 93 deletions

View file

@ -11,8 +11,8 @@
:pathname "source/" :pathname "source/"
:depends-on (#:clack #:websocket-driver #:alexandria #:hunchentoot #:cl-ppcre :depends-on (#:clack #:websocket-driver #:alexandria #:hunchentoot #:cl-ppcre
#:bordeaux-threads #:trivial-open-browser #:parse-float #:quri #:bordeaux-threads #:trivial-open-browser #:parse-float #:quri
#:lack-middleware-static #:lack-request #:mgl-pax #:cl-template #:lack-middleware-static #:lack-request #:lack-util-writer-stream
#:closer-mop #:closer-mop #:mgl-pax #:cl-template
#:sqlite #:cl-dbi) #:sqlite #:cl-dbi)
:components ((:file "clog-connection") :components ((:file "clog-connection")
(:file "clog") (:file "clog")

View file

@ -98,6 +98,11 @@ script."
(defvar *url-to-boot-file* (make-hash-table* :test 'equalp) "URL to boot-file") (defvar *url-to-boot-file* (make-hash-table* :test 'equalp) "URL to boot-file")
;; Globally NIL. While an HTML ("long poll") response is being produced,
;; the request handler rebinds this to the response's character output
;; stream, so the value is either NIL or a stream rather than a plain
;; boolean. When it is non-NIL, EXECUTE writes each script to this
;; stream wrapped in <script> tags instead of sending it over the
;; websocket, and QUERY calls FINISH-OUTPUT on it before waiting for
;; the websocket connection to appear.
(defvar *long-poll-first* nil
"Dynamic variable indicating to use html output instead of
websocket for output")
;; Globally NIL. While an HTML ("long poll") response is being produced,
;; the request handler rebinds this to the :path-info of the request.
;; The new-window handler in clog-system uses this value, when non-NIL,
;; as the routing path instead of asking the browser for its location.
(defvar *long-poll-url* nil
"Dynamic variable indicating the url path used.")
;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;
;; generate-id ;; ;; generate-id ;;
@ -305,6 +310,7 @@ the default answer. (Private)"
(port 8080) (port 8080)
(server :hunchentoot) (server :hunchentoot)
(extended-routing nil) (extended-routing nil)
(long-poll-first nil)
(boot-file "/boot.html") (boot-file "/boot.html")
(boot-function nil) (boot-function nil)
(static-boot-html nil) (static-boot-html nil)
@ -313,16 +319,19 @@ the default answer. (Private)"
"Initialize CLOG on a socket using HOST and PORT to serve BOOT-FILE "Initialize CLOG on a socket using HOST and PORT to serve BOOT-FILE
as the default route for '/' to establish web-socket connections and as the default route for '/' to establish web-socket connections and
static files located at STATIC-ROOT. The webserver used with CLACK can static files located at STATIC-ROOT. The webserver used with CLACK can
be chosen with :SERVER. If BOOT-FILE is nil no initial clog-path's be chosen with :SERVER. If LONG-POLL-FIRST is t, the output is sent
will be setup, use clog-path to add. The on-connect-handler needs to as HTML instead of websocket commands until on-new-window-handler
indentify the path by querying the browser. See PATH-NAME (in ends, this should be used in webserver applications to enable crawling
CLOG-LOCATION). If EXTENDED-ROUTING is t routes will match even if of your website. If BOOT-FILE is nil no initial clog-path's will be
extend with additional / and additional paths. If static-boot-js is setup, use clog-path to add. The on-connect-handler needs to indentify
nil then boot.js is served from the file /js/boot.js instead of the the path by querying the browser. See PATH-NAME (in CLOG-LOCATION). If
compiled version. If static-boot-html is t if boot.html is not present EXTENDED-ROUTING is t routes will match even if extend with additional
will use compiled version. boot-function if set is called with the url / and additional paths. If static-boot-js is nil then boot.js is
and the contents of boot-file and its return value replaces the served from the file /js/boot.js instead of the compiled version. If
contents sent to the brower." static-boot-html is t if boot.html is not present will use compiled
version. boot-function if set is called with the url and the contents
of boot-file and its return value replaces the contents sent to the
brower."
(set-on-connect on-connect-handler) (set-on-connect on-connect-handler)
(when boot-file (when boot-file
(set-clog-path "/" boot-file)) (set-clog-path "/" boot-file))
@ -334,19 +343,19 @@ contents sent to the brower."
(if (and (eq static-boot-js nil) (if (and (eq static-boot-js nil)
(equalp (getf env :path-info) "/js/boot.js")) (equalp (getf env :path-info) "/js/boot.js"))
`(200 (:content-type "text/javascript") `(200 (:content-type "text/javascript")
(,(compiled-boot-js))) (,(compiled-boot-js)))
(funcall app env)))) (funcall app env))))
(lambda (app) (lambda (app)
(lambda (env) (lambda (env)
;; Special handling of "clog paths" ;; Special handling of "clog paths"
(let ((clog-path (gethash (getf env :path-info) (let* ((url-path (getf env :path-info))
*url-to-boot-file*))) (clog-path (gethash url-path *url-to-boot-file*)))
(unless clog-path (unless clog-path
(when extended-routing (when extended-routing
(maphash (lambda (k v) (maphash (lambda (k v)
(unless (equal k "/") (unless (equal k "/")
(when (ppcre:scan (format nil "^~A/" k) (when (ppcre:scan (format nil "^~A/" k)
(getf env :path-info)) url-path)
(setf clog-path v)))) (setf clog-path v))))
*url-to-boot-file*))) *url-to-boot-file*)))
(cond (clog-path (cond (clog-path
@ -360,10 +369,10 @@ contents sent to the brower."
(compiled-boot-html nil nil)))) (compiled-boot-html nil nil))))
(post-data nil)) (post-data nil))
(when stream (when stream
(read-sequence page-data stream)) (read-sequence page-data stream))
(when boot-function (when boot-function
(setf page-data (funcall boot-function (setf page-data (funcall boot-function
(getf env :path-info) url-path
page-data))) page-data)))
(when (search "multipart/form-data;" (when (search "multipart/form-data;"
(getf env :content-type)) (getf env :content-type))
@ -376,12 +385,46 @@ contents sent to the brower."
"application/x-www-form-urlencoded") "application/x-www-form-urlencoded")
(setf post-data (make-string (getf env :content-length))) (setf post-data (make-string (getf env :content-length)))
(read-sequence post-data (getf env :raw-body))) (read-sequence post-data (getf env :raw-body)))
`(200 (:content-type "text/html") (cond (long-poll-first
(,(if post-data (let ((id (generate-id)))
(concatenate 'string page-data (setf (gethash id *connection-data*) (make-hash-table* :test #'equal))
(format nil "<script>clog['post-data']='~A'</script>" (setf (gethash "connection-id" (get-connection-data id)) id)
post-data)) (format t "New html connection id - ~A~%" id)
page-data))))))) (lambda (responder)
(let* ((writer (funcall responder '(200 (:content-type "text/html"))))
(stream (lack.util.writer-stream:make-writer-stream writer))
(*long-poll-url* url-path)
(*long-poll-first* stream))
(write-sequence page-data stream)
(write-sequence
(format nil "<script>clog['connection_id']=~A;Open_ws();</script>" id)
stream)
(when post-data
(write-sequence
(format nil "<script>clog['post-data']='~A'</script>"
post-data)
stream))
(if *break-on-error*
(funcall *on-connect-handler* id)
(handler-case
(funcall *on-connect-handler* id)
(t (c)
(format t "Condition caught connection ~A - ~A.~&" id c)
(values 0 c))))
(when *long-poll-first*
(finish-output stream))
(format t "HTML connection closed - ~A~%" id)))))
(t
(lambda (responder)
(let* ((writer (funcall responder '(200 (:content-type "text/html"))))
(stream (lack.util.writer-stream:make-writer-stream writer)))
(write-sequence page-data stream)
(when post-data
(write-sequence
(format nil "<script>clog['post-data']='~A'</script>"
post-data)
stream))
(finish-output stream)))))))))
;; Pass the handling on to next rule ;; Pass the handling on to next rule
(t (funcall app env)))))) (t (funcall app env))))))
(:static :path (lambda (path) (:static :path (lambda (path)
@ -395,7 +438,10 @@ contents sent to the brower."
(clog-server env)))) (clog-server env))))
(setf *client-handler* (clack:clackup *app* :server server :address host :port port)) (setf *client-handler* (clack:clackup *app* :server server :address host :port port))
(format t "HTTP listening on : ~A:~A~%" host port) (format t "HTTP listening on : ~A:~A~%" host port)
(format t "HTML Root : ~A~%" static-root) (format t "HTML root : ~A~%" static-root)
(format t "Long poll first : ~A~%" (if long-poll-first
"yes"
"no"))
(format t "Boot function added : ~A~%" (if boot-function (format t "Boot function added : ~A~%" (if boot-function
"yes" "yes"
"no")) "no"))
@ -464,9 +510,12 @@ contents sent to the brower."
(defun execute (connection-id message) (defun execute (connection-id message)
"Execute SCRIPT on CONNECTION-ID, disregard return value." "Execute SCRIPT on CONNECTION-ID, disregard return value."
(let ((con (get-connection connection-id))) (if *long-poll-first*
(when con (write-sequence (format nil "<script>~A</script>" message)
(websocket-driver:send con message)))) *long-poll-first*)
(let ((con (get-connection connection-id)))
(when con
(websocket-driver:send con message)))))
;;;;;;;;;;; ;;;;;;;;;;;
;; query ;; ;; query ;;
@ -475,6 +524,16 @@ contents sent to the brower."
(defun query (connection-id script &key (default-answer nil)) (defun query (connection-id script &key (default-answer nil))
"Execute SCRIPT on CONNECTION-ID, return value. If times out answer "Execute SCRIPT on CONNECTION-ID, return value. If times out answer
DEFAULT-ANSWER." DEFAULT-ANSWER."
;; Provide delay if needed to establish websocket connection for
;; response.
(when *long-poll-first*
(finish-output *long-poll-first*)
(loop
for n from 1 to 10 do
(let ((con (get-connection connection-id)))
(when con
(return))
(sleep .1))))
(let ((uid (generate-id))) (let ((uid (generate-id)))
(prep-query uid (when default-answer (format nil "~A" default-answer))) (prep-query uid (when default-answer (format nil "~A" default-answer)))
(execute connection-id (execute connection-id
@ -594,10 +653,23 @@ the browser contents in case of connection loss."
(defun compiled-boot-js () (defun compiled-boot-js ()
"Returns a compiled version of current version of boot.js (private)" "Returns a compiled version of current version of boot.js (private)"
"var ws; "
/*compiled version*/
var ws=null;
var adr; var adr;
var clog={}; var clog={};
var pingerid; var pingerid;
var s = document.location.search;
var tokens;
var r = /[?&]?([^=]+)=([^&]*)/g;
clog['body']=document.body;
clog['head']=document.head;
clog['documentElement']=document.documentElement;
clog['window']=window;
clog['navigator']=navigator;
clog['document']=window.document;
clog['location']=window.location;
if (typeof clog_debug == 'undefined') { if (typeof clog_debug == 'undefined') {
clog_debug = false; clog_debug = false;
@ -651,7 +723,7 @@ function Setup_ws() {
ws.onclose = function (event) { ws.onclose = function (event) {
console.log ('onclose: reconnect'); console.log ('onclose: reconnect');
ws = null; ws = null;
ws = new WebSocket (adr + '?r=' + clog['connection_id']); ws = new WebSocket (adr + '?r=' + clog['connection_id']);
ws.onopen = function (event) { ws.onopen = function (event) {
console.log ('onclose: reconnect successful'); console.log ('onclose: reconnect successful');
Setup_ws(); Setup_ws();
@ -663,43 +735,39 @@ function Setup_ws() {
} }
} }
$( document ).ready(function() { function Open_ws() {
var s = document.location.search;
var tokens;
var r = /[?&]?([^=]+)=([^&]*)/g;
clog['body']=document.body;
clog['head']=document.head;
clog['documentElement']=document.documentElement;
clog['window']=window;
clog['navigator']=navigator;
clog['document']=window.document;
clog['location']=window.location;
if (location.protocol == 'https:') { if (location.protocol == 'https:') {
adr = 'wss://' + location.hostname; adr = 'wss://' + location.hostname;
} else { } else {
adr = 'ws://' + location.hostname; adr = 'ws://' + location.hostname;
} }
if (location.port != '') { adr = adr + ':' + location.port; } if (location.port != '') { adr = adr + ':' + location.port; }
adr = adr + '/clog'; adr = adr + '/clog';
if (clog['connection_id']) { adr = adr + '?r=' + clog['connection_id'] }
try { try {
console.log ('connecting to ' + adr); console.log ('connecting to ' + adr);
ws = new WebSocket (adr); ws = new WebSocket (adr);
} catch (e) { } catch (e) {
console.log ('trying again, connecting to ' + adr); console.log ('trying again, connecting to ' + adr);
ws = new WebSocket (adr); ws = new WebSocket (adr);
} }
if (ws != null) { if (ws != null) {
ws.onopen = function (event) { ws.onopen = function (event) {
console.log ('connection successful'); console.log ('connection successful');
Setup_ws(); Setup_ws();
} }
pingerid = setInterval (function () {Ping_ws ();}, 10000); pingerid = setInterval (function () {Ping_ws ();}, 10000);
} else { } else {
document.writeln ('If you are seeing this your browser or your connection to the internet is blocking websockets.'); document.writeln ('If you are seeing this your browser or your connection to the internet is blocking websockets.');
} }
});") }
$( document ).ready(function() {
if (ws == null) { Open_ws(); }
});
")

View file

@ -37,7 +37,9 @@ the same as the clog directy this overides the relative paths used in them.")
(when clog-connection:*verbose-output* (when clog-connection:*verbose-output*
(format t "Start new window handler on connection-id - ~A" connection-id)) (format t "Start new window handler on connection-id - ~A" connection-id))
(let ((body (make-clog-body connection-id))) (let ((body (make-clog-body connection-id)))
(let* ((path (path-name (location body))) (let* ((path (if clog-connection::*long-poll-url*
clog-connection::*long-poll-url*
(path-name (location body))))
(on-new-window (gethash path *url-to-on-new-window*))) (on-new-window (gethash path *url-to-on-new-window*)))
(unless on-new-window (unless on-new-window
(when *extended-routing* (when *extended-routing*
@ -64,6 +66,7 @@ the same as the clog directy this overides the relative paths used in them.")
(port 8080) (port 8080)
(server :hunchentoot) (server :hunchentoot)
(extended-routing nil) (extended-routing nil)
(long-poll-first nil)
(boot-file "/boot.html") (boot-file "/boot.html")
(boot-function nil) (boot-function nil)
(static-boot-html nil) (static-boot-html nil)
@ -74,9 +77,12 @@ the same as the clog directy this overides the relative paths used in them.")
as the default route to establish web-socket connections and static as the default route to establish web-socket connections and static
files located at STATIC-ROOT. The webserver used with CLACK can be files located at STATIC-ROOT. The webserver used with CLACK can be
chosed with :SERVER. If EXTENDED-ROUTING is t routes will match even chosed with :SERVER. If EXTENDED-ROUTING is t routes will match even
if extend with additional / and additional paths. If CLOG was already if extend with additional / and additional paths. If
initialized and not shut down, this function does the same as LONG-POLL-FIRST is t, the output is sent as HTML instead of
set-on-new-window (does not change the static-root). If websocket commands until on-new-window-handler ends, this should be
used in webserver applications to enable crawling of your website. If
CLOG was already initialized and not shut down, this function does the
same as set-on-new-window (does not change the static-root). If
ON-NEW-WINDOW-HANDLER is nil no handler is set and none is ON-NEW-WINDOW-HANDLER is nil no handler is set and none is
removed. STATIC-ROOT by default is the \"directory CLOG is installed removed. STATIC-ROOT by default is the \"directory CLOG is installed
in ./static-files\" If the variable clog:*overide-static-root* is set in ./static-files\" If the variable clog:*overide-static-root* is set

View file

@ -1,8 +1,19 @@
/*static version*/ /*static version*/
var ws; var ws=null;
var adr; var adr;
var clog={}; var clog={};
var pingerid; var pingerid;
var s = document.location.search;
var tokens;
var r = /[?&]?([^=]+)=([^&]*)/g;
clog['body']=document.body;
clog['head']=document.head;
clog['documentElement']=document.documentElement;
clog['window']=window;
clog['navigator']=navigator;
clog['document']=window.document;
clog['location']=window.location;
if (typeof clog_debug == 'undefined') { if (typeof clog_debug == 'undefined') {
clog_debug = false; clog_debug = false;
@ -56,7 +67,7 @@ function Setup_ws() {
ws.onclose = function (event) { ws.onclose = function (event) {
console.log ('onclose: reconnect'); console.log ('onclose: reconnect');
ws = null; ws = null;
ws = new WebSocket (adr + '?r=' + clog['connection_id']); ws = new WebSocket (adr + '?r=' + clog['connection_id']);
ws.onopen = function (event) { ws.onopen = function (event) {
console.log ('onclose: reconnect successful'); console.log ('onclose: reconnect successful');
Setup_ws(); Setup_ws();
@ -68,43 +79,37 @@ function Setup_ws() {
} }
} }
$( document ).ready(function() { function Open_ws() {
var s = document.location.search;
var tokens;
var r = /[?&]?([^=]+)=([^&]*)/g;
clog['body']=document.body;
clog['head']=document.head;
clog['documentElement']=document.documentElement;
clog['window']=window;
clog['navigator']=navigator;
clog['document']=window.document;
clog['location']=window.location;
if (location.protocol == 'https:') { if (location.protocol == 'https:') {
adr = 'wss://' + location.hostname; adr = 'wss://' + location.hostname;
} else { } else {
adr = 'ws://' + location.hostname; adr = 'ws://' + location.hostname;
} }
if (location.port != '') { adr = adr + ':' + location.port; } if (location.port != '') { adr = adr + ':' + location.port; }
adr = adr + '/clog'; adr = adr + '/clog';
if (clog['connection_id']) { adr = adr + '?r=' + clog['connection_id'] }
try { try {
console.log ('connecting to ' + adr); console.log ('connecting to ' + adr);
ws = new WebSocket (adr); ws = new WebSocket (adr);
} catch (e) { } catch (e) {
console.log ('trying again, connecting to ' + adr); console.log ('trying again, connecting to ' + adr);
ws = new WebSocket (adr); ws = new WebSocket (adr);
} }
if (ws != null) { if (ws != null) {
ws.onopen = function (event) { ws.onopen = function (event) {
console.log ('connection successful'); console.log ('connection successful');
Setup_ws(); Setup_ws();
} }
pingerid = setInterval (function () {Ping_ws ();}, 10000); pingerid = setInterval (function () {Ping_ws ();}, 10000);
} else { } else {
document.writeln ('If you are seeing this your browser or your connection to the internet is blocking websockets.'); document.writeln ('If you are seeing this your browser or your connection to the internet is blocking websockets.');
} }
}
$( document ).ready(function() {
if (ws == null) { Open_ws(); }
}); });

View file

@ -34,12 +34,12 @@
<ul> <ul>
<li><a href='/page1'>/page1</a> - a CLOG app <li><a href='/page1'>/page1</a> - a CLOG app
<li><a href='/page1.html'>/page1.html</a> - a CLOG app mascarading as a .html <li><a href='/page1.html'>/page1.html</a> - a CLOG app mascarading as a .html
<li><a href='/somepath/hi/'>/somepath/hi/</a> - a CLOG app mascarading as a .html <li><a href='/somepath/hi/'>/somepath/hi/</a> - deeper paths
<li><a href='/page2'>/page2</a> - a CLOG app using an alternative boot file <li><a href='/page2'>/page2</a> - a CLOG app using an alternative boot file
<li><a href='/page3'>/page3</a> - tutorial 11 as part of this tutorial <li><a href='/page3'>/page3</a> - tutorial 11 as part of this tutorial
<li><a href='/tutorial/tut-11.html'>/tutorial/tut-11.html</a> - an html file using boot.js <li><a href='/tutorial/tut-11.html'>/tutorial/tut-11.html</a> - an html file using boot.js
<li><a href='/tutorial/some-file.html'>/tutorial/some-file.html</a> - an html file using boot.js <li><a href='/tutorial/some-file.html'>/tutorial/some-file.html</a> - an html file using boot.js
<li><a href='/tutorial/regular-file.html'>'/tutorial/regular-file.html</a> - a regular html file <li><a href='/tutorial/regular-file.html'>/tutorial/regular-file.html</a> - a regular html file
</ul>")) </ul>"))
(defun on-page1 (body) (defun on-page1 (body)
@ -100,9 +100,12 @@
"Start turtorial." "Start turtorial."
;; Setup the default route / to on-main ;; Setup the default route / to on-main
;; :boot-function allows us to add or modify our boot-files content ;; :boot-function allows us to add or modify our boot-files content
;; for search engine optimization ;; for search engine optimization. We choose :long-poll-first so
(initialize 'on-main :boot-function 'add-search-optimizations ;; our website can be crawled for content by google
:extended-routing t) (initialize 'on-main
:long-poll-first t
:boot-function 'add-search-optimizations
:extended-routing t)
;; Navigating to http://127.0.0.1:8080/page1 executes on-page1 ;; Navigating to http://127.0.0.1:8080/page1 executes on-page1
;; Since extended-routing is t /page1/any/thing/else also routes to /page1 ;; Since extended-routing is t /page1/any/thing/else also routes to /page1
(set-on-new-window 'on-page1 :path "/page1") (set-on-new-window 'on-page1 :path "/page1")