;;; nxml-mode.el --- a new XML mode -*- lexical-binding:t -*-
;; Copyright (C) 2003-2004, 2007-2025 Free Software Foundation, Inc.
;; Author: James Clark
;; Keywords: text, hypermedia, languages, XML
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see .
;;; Commentary:
;; See nxml-rap.el for description of parsing strategy.
;;; Code:
(eval-when-compile (require 'cl-lib))
(require 'xmltok)
(require 'nxml-enc)
(require 'nxml-util)
(require 'nxml-rap)
(require 'nxml-outln)
;; nxml-mode calls rng-nxml-mode-init, which is autoloaded from rng-nxml.
;; So we might as well just require it and silence the compiler.
(provide 'nxml-mode) ; avoid recursive require
(require 'rng-nxml)
(require 'sgml-mode)
;;; Customization
(defgroup nxml nil
"New XML editing mode."
:link '(custom-manual "(nxml-mode) Top")
:group 'languages)
(defgroup nxml-faces nil
"Faces for XML syntax highlighting."
:group 'nxml)
(defcustom nxml-char-ref-display-glyph-flag t
"Non-nil means display glyph following character reference.
The glyph is displayed in face `nxml-glyph'."
:group 'nxml
:type 'boolean
:safe #'booleanp)
(defcustom nxml-sexp-element-flag t
"Non-nil means sexp commands treat an element as a single expression."
:version "27.1" ; nil -> t
:group 'nxml
:type 'boolean
:safe #'booleanp)
(defcustom nxml-slash-auto-complete-flag nil
"Non-nil means typing a slash automatically completes the end-tag.
This is used by `nxml-electric-slash'."
:group 'nxml
:type 'boolean
:safe #'booleanp)
(defcustom nxml-child-indent 2
"Indentation for the children of an element relative to the start-tag.
This only applies when the line or lines containing the start-tag contains
nothing else other than that start-tag."
:group 'nxml
:type 'integer
:safe #'integerp)
(defcustom nxml-attribute-indent 4
"Indentation for the attributes of an element relative to the start-tag.
This only applies when the first attribute of a tag starts a line.
In other cases, the first attribute on one line is indented the same
as the first attribute on the previous line."
:group 'nxml
:type 'integer
:safe #'integerp)
(defcustom nxml-bind-meta-tab-to-complete-flag t
"Non-nil means to use nXML completion in \\[completion-at-point]."
:group 'nxml
:type 'boolean
:safe #'booleanp)
(defcustom nxml-prefer-utf-16-to-utf-8-flag nil
"Non-nil means prefer UTF-16 to UTF-8 when saving a buffer.
This is used only when a buffer does not contain an encoding declaration
and when its current `buffer-file-coding-system' specifies neither UTF-16
nor UTF-8."
:group 'nxml
:type 'boolean
:safe #'booleanp)
(defcustom nxml-prefer-utf-16-little-to-big-endian-flag (eq system-type
'windows-nt)
"Non-nil means prefer little-endian to big-endian byte-order for UTF-16.
This is used only for saving a buffer; when reading the
byte-order is auto-detected. It may be relevant both when there
is no encoding declaration and when the encoding declaration
specifies `UTF-16'."
:group 'nxml
:type 'boolean
:safe #'booleanp)
(defcustom nxml-default-buffer-file-coding-system nil
"Default value for `buffer-file-coding-system' for a buffer for a new file.
A value of nil means use the default value of
`buffer-file-coding-system' as normal.
A buffer's `buffer-file-coding-system' affects what
\\[nxml-insert-xml-declaration] inserts."
:group 'nxml
:type 'coding-system
:safe #'coding-system-p)
(defcustom nxml-auto-insert-xml-declaration-flag nil
"Non-nil means automatically insert an XML declaration in a new file.
The XML declaration is inserted using `nxml-insert-xml-declaration'."
:group 'nxml
:type 'boolean
:safe #'booleanp)
(defface nxml-delimited-data
'((t (:inherit font-lock-doc-face)))
"Face used to highlight data enclosed between delimiters.
This is not used directly, but only via inheritance by other faces."
:group 'nxml-faces)
(defface nxml-name
'((t (:inherit font-lock-builtin-face)))
"Face used to highlight various names.
This includes element and attribute names, processing
instruction targets and the CDATA keyword in a CDATA section.
This is not used directly, but only via inheritance by other faces."
:group 'nxml-faces)
(defface nxml-ref
'((t (:inherit font-lock-constant-face)))
"Face used to highlight character and entity references.
This is not used directly, but only via inheritance by other faces."
:group 'nxml-faces)
(defface nxml-text
'((t (:inherit default)))
"Face used to highlight text."
:group 'nxml-faces)
(defface nxml-delimiter
'((t (:inherit nxml-text)))
"Face used to highlight delimiters.
This is not used directly, but only via inheritance by other faces."
:group 'nxml-faces)
(defface nxml-processing-instruction-delimiter
'((t (:inherit nxml-delimiter)))
"Face used for the delimiters of processing instructions, i.e., and ?>."
:group 'nxml-faces)
(defface nxml-processing-instruction-target
'((t (:inherit font-lock-keyword-face)))
"Face used for the target of processing instructions."
:group 'nxml-faces)
(defface nxml-processing-instruction-content
'((t (:inherit nxml-delimited-data)))
"Face used for the content of processing instructions."
:group 'nxml-faces)
(defface nxml-cdata-section-delimiter
'((t (:inherit nxml-delimiter)))
"Face used for the delimiters of CDATA sections, i.e., ."
:group 'nxml-faces)
(defface nxml-cdata-section-CDATA
'((t (:inherit nxml-name)))
"Face used for the CDATA keyword in CDATA sections."
:group 'nxml-faces)
(defface nxml-cdata-section-content
'((t (:inherit nxml-text)))
"Face used for the content of CDATA sections."
:group 'nxml-faces)
(defface nxml-char-ref-number
'((t (:inherit nxml-ref)))
"Face used for the number in character references.
This includes the `x' in hex references."
:group 'nxml-faces)
(defface nxml-char-ref-delimiter
'((t (:inherit nxml-ref)))
"Face used for the delimiters of character references, i.e., and ;."
:group 'nxml-faces)
(defface nxml-entity-ref-name
'((t (:inherit nxml-ref)))
"Face used for the entity name in general entity references."
:group 'nxml-faces)
(defface nxml-entity-ref-delimiter
'((t (:inherit nxml-ref)))
"Face used for the delimiters of entity references, i.e., & and ;."
:group 'nxml-faces)
(defface nxml-tag-delimiter
'((t (:inherit nxml-delimiter)))
"Face used for the angle brackets delimiting tags.
`nxml-tag-slash' is used for slashes."
:group 'nxml-faces)
(defface nxml-tag-slash
'((t (:inherit nxml-tag-delimiter)))
"Face used for slashes in tags, both in end-tags and empty-elements."
:group 'nxml-faces)
(defface nxml-element-prefix
'((t (:inherit nxml-name)))
"Face used for the prefix of elements."
:group 'nxml-faces)
(defface nxml-element-colon
'((t (:inherit nxml-delimiter)))
"Face used for the colon in element names."
:group 'nxml-faces)
(defface nxml-element-local-name
'((t (:inherit font-lock-function-name-face)))
"Face used for the local name of elements."
:group 'nxml-faces)
(defface nxml-attribute-prefix
'((t (:inherit nxml-name)))
"Face used for the prefix of attributes."
:group 'nxml-faces)
(defface nxml-attribute-colon
'((t (:inherit nxml-delimiter)))
"Face used for the colon in attribute names."
:group 'nxml-faces)
(defface nxml-attribute-local-name
'((t (:inherit font-lock-variable-name-face)))
"Face used for the local name of attributes."
:group 'nxml-faces)
(defface nxml-namespace-attribute-xmlns
'((t (:inherit nxml-attribute-prefix)))
"Face used for `xmlns' in namespace attributes."
:group 'nxml-faces)
(defface nxml-namespace-attribute-colon
'((t (:inherit nxml-attribute-colon)))
"Face used for the colon in namespace attributes."
:group 'nxml-faces)
(defface nxml-namespace-attribute-prefix
'((t (:inherit nxml-attribute-local-name)))
"Face used for the prefix declared in namespace attributes."
:group 'nxml-faces)
(defface nxml-attribute-value
'((t (:inherit font-lock-string-face)))
"Face used for the value of attributes."
:group 'nxml-faces)
(defface nxml-attribute-value-delimiter
'((t (:inherit nxml-attribute-value)))
"Face used for the delimiters of attribute values."
:group 'nxml-faces)
(defface nxml-prolog-literal-delimiter
'((t (:inherit nxml-delimited-data)))
"Face used for the delimiters of literals in the prolog."
:group 'nxml-faces)
(defface nxml-prolog-literal-content
'((t (:inherit nxml-delimited-data)))
"Face used for the content of literals in the prolog."
:group 'nxml-faces)
(defface nxml-prolog-keyword
'((t (:inherit font-lock-keyword-face)))
"Face used for keywords in the prolog."
:group 'nxml-faces)
(defface nxml-markup-declaration-delimiter
'((t (:inherit nxml-delimiter)))
"Face used for the delimiters of markup declarations in the prolog.
The delimiters are ."
:group 'nxml-faces)
(defface nxml-hash
'((t (:inherit nxml-name)))
"Face used for # before a name in the prolog."
:group 'nxml-faces)
(defface nxml-glyph
'((((type x))
(:family
"misc-fixed"
:background
"light grey"
:foreground
"black"
:weight
normal
:slant
normal))
(t
(:background
"light grey"
:foreground
"black"
:weight
normal
:slant
normal)))
"Face used for glyph for char references."
:group 'nxml-faces)
;;; Global variables
(defvar-local nxml-parent-document nil
"The parent document for a part of a modular document.
Use `nxml-parent-document-set' to set it.")
(put 'nxml-parent-document 'safe-local-variable 'stringp)
(defvar-local nxml-prolog-regions nil
"List of regions in the prolog to be fontified.
See the function `xmltok-forward-prolog' for more information.")
(defvar-local nxml-degraded nil
"Non-nil if currently operating in degraded mode.
Degraded mode is enabled when an internal error is encountered in the
fontification or after-change functions.")
(defvar nxml-completion-hook nil
"Hook run by `nxml-complete'.
This hook is run until success.")
(defvar nxml-in-mixed-content-hook nil
"Hook to determine whether point is in mixed content.
The hook is called without arguments. It should return nil if it is
definitely not mixed; non-nil otherwise. The hook will be run until
one of the functions returns nil.")
(defvar nxml-mixed-scan-distance 4000
"Maximum distance from point to scan when checking for mixed content.")
(defvar nxml-end-tag-indent-scan-distance 4000
"Maximum distance from point to scan backwards when indenting end-tag.")
(defvar-local nxml-char-ref-extra-display t
"Non-nil means display extra information for character references.
The extra information consists of a tooltip with the character name
and, if `nxml-char-ref-display-glyph-flag' is non-nil, a glyph
corresponding to the referenced character following the character
reference.")
(defvar-keymap nxml-mode-map
:doc "Keymap for `nxml-mode'."
"C-M-u" #'nxml-backward-up-element
"C-M-d" #'nxml-down-element
"C-M-n" #'nxml-forward-element
"C-M-p" #'nxml-backward-element
"M-{" #'nxml-backward-paragraph
"M-}" #'nxml-forward-paragraph
"M-h" #'nxml-mark-paragraph
"C-c C-f" #'nxml-finish-element
"C-c ]" #'nxml-finish-element
"C-c /" #'nxml-finish-element
"C-c C-m" #'nxml-split-element
"C-c C-b" #'nxml-balanced-close-start-tag-block
"C-c C-i" #'nxml-balanced-close-start-tag-inline
"C-c C-x" #'nxml-insert-xml-declaration
"C-c C-d" #'nxml-dynamic-markup-word
;; u is for Unicode
"C-c C-u" #'nxml-insert-named-char
"C-c C-o" nxml-outline-prefix-map
"/" #'nxml-electric-slash
"S-" #'nxml-mouse-hide-direct-text-content)
(defvar nxml-font-lock-keywords
'(nxml-fontify-matcher)
"Default font lock keywords for `nxml-mode'.")
(defsubst nxml-set-face (start end face)
(when (and face (< start end))
;; Prepend, so the character reference highlighting takes precedence over
;; the string highlighting applied syntactically.
(font-lock-prepend-text-property start end 'face face)))
(defun nxml-parent-document-set (parent-document)
"Set `nxml-parent-document' and inherit the DTD &c."
;; FIXME: this does not work.
;; the idea is that by inheriting some variables from the parent,
;; `rng-validate-mode' will validate entities declared in the parent.
;; alas, the most interesting variables (`rng-compile-table' et al)
;; are circular and cannot be printed even with `print-circle'.
(interactive "fParent document")
(let (dtd current-schema current-schema-file-name compile-table
ipattern-table last-ipattern-index)
(when (string= (file-truename parent-document)
(file-truename buffer-file-name))
(error "Parent document cannot be the same as the document"))
(with-current-buffer (find-file-noselect parent-document)
(setq dtd rng-dtd
current-schema rng-current-schema
current-schema-file-name rng-current-schema-file-name
compile-table rng-compile-table
ipattern-table rng-ipattern-table
last-ipattern-index rng-last-ipattern-index
parent-document buffer-file-name))
(setq rng-dtd dtd
rng-current-schema current-schema
rng-current-schema-file-name current-schema-file-name
rng-compile-table compile-table
rng-ipattern-table ipattern-table
rng-last-ipattern-index last-ipattern-index
nxml-parent-document parent-document)
(message "Set parent document to %s" parent-document)
(when rng-validate-mode
(rng-validate-while-idle (current-buffer)))))
(defvar nxml-prolog-end) ;; nxml-rap.el
(defun nxml-syntax-propertize (start end)
"Syntactic keywords for `nxml-mode'."
;; Like `sgml-syntax-propertize', but rescan prolog if needed.
(when (< start nxml-prolog-end)
(nxml-scan-prolog))
(sgml-syntax-propertize start end))
(defvar tildify-space-string)
(defvar tildify-foreach-region-function)
;;;###autoload
(define-derived-mode nxml-mode text-mode "nXML"
;; We use C-c C-i instead of \\[nxml-balanced-close-start-tag-inline]
;; because Emacs turns C-c C-i into C-c TAB which is hard to type and
;; not mnemonic.
"Major mode for editing XML.
\\
\\[nxml-finish-element] finishes the current element by inserting an end-tag.
C-c C-i closes a start-tag with `>' and then inserts a balancing end-tag
leaving point between the start-tag and end-tag.
\\[nxml-balanced-close-start-tag-block] is similar but for block rather than inline elements:
the start-tag, point, and end-tag are all left on separate lines.
If `nxml-slash-auto-complete-flag' is non-nil, then inserting a `'
automatically inserts the rest of the end-tag.
\\[completion-at-point] performs completion on the symbol preceding point.
\\[nxml-dynamic-markup-word] uses the contents of the current buffer
to choose a tag to put around the word preceding point.
Sections of the document can be displayed in outline form. The
variable `nxml-section-element-name-regexp' controls when an element
is recognized as a section. The same key sequences that change
visibility in outline mode are used except that they start with C-c C-o
instead of C-c.
Validation is provided by the related minor-mode `rng-validate-mode'.
This also makes completion schema- and context- sensitive. Element
names, attribute names, attribute values and namespace URIs can all be
completed. By default, `rng-validate-mode' is automatically enabled.
You can toggle it using \\[rng-validate-mode] or change the default by
customizing `rng-nxml-auto-validate-flag'.
\\[indent-for-tab-command] indents the current line appropriately.
This can be customized using the variable `nxml-child-indent'
and the variable `nxml-attribute-indent'.
\\[nxml-insert-named-char] inserts a character reference using
the character's name (by default, the Unicode name).
\\[universal-argument] \\[nxml-insert-named-char] inserts the character directly.
The Emacs commands that normally operate on balanced expressions will
operate on XML markup items. Thus \\[forward-sexp] will move forward
across one markup item; \\[backward-sexp] will move backward across
one markup item; \\[kill-sexp] will kill the following markup item;
\\[mark-sexp] will mark the following markup item. By default, the
complete element is treated as a single markup item; to make each tag be
treated as a separate markup item, set the variable `nxml-sexp-element-flag'
to nil. For more details, see the function `nxml-forward-balanced-item'.
\\[nxml-backward-up-element] and \\[nxml-down-element] move up and down the element structure.
Many aspects this mode can be customized using
\\[customize-group] nxml RET."
;; (kill-all-local-variables)
;; If encoding does not allow non-break space character, use reference.
;; FIXME: This duplicates code from sgml-mode, perhaps derive from it?
;; FIXME: Perhaps use if possible (e.g. XHTML)?
(setq-local tildify-space-string
(if (equal (decode-coding-string
(encode-coding-string " " buffer-file-coding-system)
buffer-file-coding-system) " ")
" " " "))
;; FIXME: Use the fact that we're parsing the document already
;; rather than using regex-based filtering.
(setq-local tildify-foreach-region-function
(apply-partially 'tildify-foreach-ignore-environments
'(("") ("<" . ">"))))
(setq-local mode-line-process '((nxml-degraded "/degraded")))
;; We'll determine the fill prefix ourselves
(setq-local adaptive-fill-mode nil)
(setq-local forward-sexp-function #'nxml-forward-balanced-item)
(setq-local indent-line-function #'nxml-indent-line)
(setq-local fill-paragraph-function #'nxml-do-fill-paragraph)
;; Comment support
;; This doesn't seem to work too well;
;; I think we should probably roll our own nxml-comment-dwim function.
(setq-local comment-indent-function #'nxml-indent-line)
(setq-local comment-start "")
(setq-local comment-end-skip "[ \t\r\n]*-->")
(setq-local comment-line-break-function #'nxml-newline-and-indent)
(setq-local comment-quote-nested-function #'nxml-comment-quote-nested)
(setq-local comment-continue "") ; avoid double-hyphens as a padding
(setq-local hs-block-start-regexp "<[^/>]*?")
(setq-local hs-block-end-regexp "[^/>]*[^/]>")
(setq-local hs-c-start-regexp ""))
((eq xmltok-type 'cdata-section)
(nxml-compute-indent-in-delimited-token pos ""))
((eq xmltok-type 'processing-instruction)
(nxml-compute-indent-in-delimited-token pos "" "?>"))
(t
(goto-char pos)
(if (and (= (forward-line -1) 0)
(< xmltok-start (point)))
(back-to-indentation)
(goto-char xmltok-start))
(current-column))))
(defun nxml-compute-indent-in-start-tag (pos)
"Return the indent for a line that starts inside a start-tag.
Also for a line that starts inside an empty element.
POS is the position of the first non-whitespace character of the line.
This expects the xmltok-* variables to be set up as by `xmltok-forward'."
(let ((value-boundary (nxml-attribute-value-boundary pos))
(off 0))
(if value-boundary
;; inside an attribute value
(let ((value-start (car value-boundary)))
(goto-char pos)
(forward-line -1)
(if (< (point) value-start)
(goto-char value-start)
(back-to-indentation)))
;; outside an attribute value
(goto-char pos)
(while (and (= (forward-line -1) 0)
(nxml-attribute-value-boundary (point))))
(cond ((<= (point) xmltok-start)
(goto-char xmltok-start)
(setq off nxml-attribute-indent)
(let ((atts (xmltok-merge-attributes)))
(when atts
(let* ((att (car atts))
(start (xmltok-attribute-name-start att)))
(when (< start pos)
(goto-char start)
(setq off 0))))))
(t
(back-to-indentation))))
(+ (current-column) off)))
(defun nxml-attribute-value-boundary (pos)
"Return a pair (START . END) if POS is inside an attribute value.
Otherwise return nil. START and END are the positions of the start
and end of the attribute value containing POS. This expects the
xmltok-* variables to be set up as by `xmltok-forward'."
(let ((atts (xmltok-merge-attributes))
att value-start value-end value-boundary)
(while atts
(setq att (car atts))
(setq value-start (xmltok-attribute-value-start att))
(setq value-end (xmltok-attribute-value-end att))
(cond ((and value-start (< pos value-start))
(setq atts nil))
((and value-start value-end (<= pos value-end))
(setq value-boundary (cons value-start value-end))
(setq atts nil))
(t (setq atts (cdr atts)))))
value-boundary))
(defun nxml-compute-indent-in-delimited-token (pos open-delim close-delim)
"Return the indent for a line that starts inside a token with delimiters.
OPEN-DELIM and CLOSE-DELIM are strings giving the opening and closing
delimiters. POS is the position of the first non-whitespace character
of the line. This expects the xmltok-* variables to be set up as by
`xmltok-forward'."
(cond ((let ((end (+ pos (length close-delim))))
(and (<= end (point-max))
(string= (buffer-substring-no-properties pos end)
close-delim)))
(goto-char xmltok-start))
((progn
(goto-char pos)
(forward-line -1)
(while (looking-at "^[[:blank:]]*$")
(forward-line -1))
(<= (point) xmltok-start))
(goto-char (+ xmltok-start (length open-delim)))
(when (and (string= open-delim "