emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: url-retrieve-synchronously and coding


From: Lennart Borgman
Subject: Re: url-retrieve-synchronously and coding
Date: Wed, 26 Jan 2011 23:08:17 +0100

On Tue, Jan 25, 2011 at 12:01 PM, Lennart Borgman
<address@hidden> wrote:
> On Tue, Jan 25, 2011 at 11:47 AM, Julien Danjou <address@hidden> wrote:
>> On Mon, Jan 24 2011, Lennart Borgman wrote:
>>
>>> It looks to me like url-insert-file-contents is a good place for
>>> decoding. So I suggest the following:
>>>
>>> 1) Move the decoding from url-insert to url-insert-file-contents.
>>
>> I'd like to be able to use the coding detection code and decoding on
>> an already retrieved buffer, so this can be used in
>> url-insert-file-contents, but it must be an autonomous function that I
>> can call myself.
>
> Yes, of course.
>
>
>>> 2) Replace the call to decode-coding-inserted-region in
>>> url-insert-file-contents with something that also takes care of xml
>>> encoding and similar things.


I changed my mind a bit. It looks like it is best to do all the
URL-related decoding in url-insert, since that is where you have the
information about the HTTP headers. Below is the new suggestion. (The
doc strings need some rework.)



(defvar coding-finders
  (list (list "text/xml" 'coding-finder-for-xml))
  "Alist mapping media types to coding finder functions.
Each element has the form (MEDIA-TYPE FINDER).  MEDIA-TYPE is a
string that is compared with `assoc' against the media type of a
retrieved document.  FINDER is a function that is called with one
argument, the buffer holding the document, and returns the coding
system to decode that buffer with, or nil if it cannot tell.")

(defun coding-finder-for-xml (src)
  "Return the coding system to use for the XML document in SRC.
SRC is either a buffer holding the document or a string holding
its text.  For a string, only the first 100 characters (or the
whole string, if it is shorter) are examined.

When SRC is a buffer, point in that buffer is left where it was.

The value is whatever `coding-finder-for-xml--scan' returns, which
is nil when no coding system could be determined."
  (if (bufferp src)
      (with-current-buffer src
        ;; `save-excursion' restores point even if the scan signals.
        (save-excursion
          (coding-finder-for-xml--scan)))
    ;; `with-temp-buffer' guarantees the scratch buffer is killed, and
    ;; clamping the end index keeps short strings from signaling
    ;; `args-out-of-range'.
    (with-temp-buffer
      (insert (substring src 0 (min 100 (length src))))
      (coding-finder-for-xml--scan))))

(defun coding-finder-for-xml--scan ()
  "Return the coding system for the XML text in the current buffer.
Moves point to the beginning of the accessible portion and asks
`xmltok-get-declared-encoding-position' for the declared encoding.

If that returns a cons (START . END), the text between START and
END is taken as the encoding name: \"utf-16\" (in any case) is
resolved with `nxml-choose-utf-16-coding-system', any other name
with `nxml-mime-charset-coding-system'.  If it returns some other
non-nil value, `nxml-choose-utf-coding-system' picks the coding
system.  If it returns nil, the value is nil."
  ;; These helpers are not autoloaded; make sure they are defined.
  (require 'xmltok)
  (require 'nxml-mode)
  (goto-char (point-min))
  (let* ((enc-pos (xmltok-get-declared-encoding-position))
         (enc-name (and (consp enc-pos)
                        (buffer-substring-no-properties (car enc-pos)
                                                        (cdr enc-pos)))))
    (cond (enc-name
           (if (string= (downcase enc-name) "utf-16")
               (nxml-choose-utf-16-coding-system)
             (nxml-mime-charset-coding-system enc-name)))
          (enc-pos
           (nxml-choose-utf-coding-system)))))

(defun url-decode (buffer charset media-type)
  "Decode the whole of BUFFER in place and report whether that happened.
The buffer is widened for the duration of the call, so any
narrowing is ignored.

If CHARSET is non-nil, the buffer text is replaced with the
result of passing it and CHARSET to `mm-decode-string', and the
value is t.

Otherwise, if MEDIA-TYPE is non-nil, it is looked up in
`coding-finders'.  The finder function found there, if any, is
called with BUFFER as its argument; when it returns a coding
system the buffer is decoded with it and the value is t.

In every other case the buffer is left untouched and the value
is nil."
  (with-current-buffer buffer
    (save-restriction
      (widen)
      (cond
       (charset
        (let ((raw (buffer-substring-no-properties (point-min) (point-max))))
          (delete-region (point-min) (point-max))
          (insert (mm-decode-string raw charset)))
        t)
       (media-type
        (let* ((finder (cadr (assoc media-type coding-finders)))
               (found (and finder
                           (funcall finder (current-buffer)))))
          (when found
            (decode-coding-region (point-min) (point-max) found)
            t)))))))

(defun url-insert (buffer &optional beg end)
  "Insert the body of a URL object at point in the current buffer.
BUFFER should be a complete URL buffer as returned by `url-retrieve'.

The body is decoded with `url-decode' before it is inserted: with
the charset given in the headers if there is one, otherwise with
whatever the coding finder registered for the body's media type
reports.

BEG and END can be used to insert only a subpart of the body.
They count from the beginning of the body; a nil BEG means the
start of the body and a nil END means its end.

Return a list of the form (SIZE CHARSET).  SIZE is the length of
the selected part of the body before decoding.  CHARSET is the
charset that was specified in the headers.  If the headers named
no charset, CHARSET is t when the body was nevertheless decoded
by a media-type coding finder, and nil when it was inserted
undecoded, so callers can use it to decide whether the text still
needs decoding."
  (let ((handle (with-current-buffer buffer (mm-dissect-buffer t)))
        data charset media-type decoded)
    ;; Release the MIME handle even if extracting the body signals
    ;; (for instance when BEG or END is out of range).
    (unwind-protect
        (setq data (with-current-buffer (mm-handle-buffer handle)
                     (if (or beg end)
                         (buffer-substring
                          (+ (point-min) (or beg 0))
                          (if end (+ (point-min) end) (point-max)))
                       (buffer-string)))
              charset (mail-content-type-get (mm-handle-type handle)
                                             'charset)
              media-type (mm-handle-media-type handle))
      (mm-destroy-parts handle))
    (insert
     ;; Decode in a scratch buffer that is killed on every exit path.
     (with-temp-buffer
       (insert data)
       ;; Remember whether decoding happened so the caller does not
       ;; decode the inserted text a second time.
       (setq decoded (url-decode (current-buffer) charset media-type))
       (buffer-substring-no-properties (point-min) (point-max))))
    (list (length data) (or charset decoded))))

;;;###autoload
(defun url-insert-file-contents (url &optional visit beg end replace)
  "Retrieve URL synchronously and insert its body at point.
If VISIT is non-nil, set `buffer-file-name' to URL.  BEG and END
are passed on to `url-insert' to select a subpart of the body.
If REPLACE is non-nil, delete the rest of the accessible portion
of the buffer, leaving only the inserted text.

If `url-insert' reports that it did not decode the body, the
inserted text is decoded with `decode-coding-inserted-region',
i.e. by the usual rules applied to files.

Point is left where it was.  Return a list (URL SIZE), where SIZE
is the size reported by `url-insert'.  Signal an error if URL
could not be retrieved."
  (let ((buffer (url-retrieve-synchronously url)))
    (if (not buffer)
        (error "Opening input file: No such file or directory, %s" url))
    (if visit (setq buffer-file-name url))
    (save-excursion
      (let* ((start (point))
             ;; Kill the retrieval buffer even if inserting signals.
             (size-decoded (unwind-protect
                               (url-insert buffer beg end)
                             (kill-buffer buffer)))
             (size    (nth 0 size-decoded))
             (decoded (nth 1 size-decoded)))
        (when replace
          ;; Point is now just after the inserted text; drop everything
          ;; before START and everything after point.
          (delete-region (point-min) start)
          (delete-region (point) (point-max)))
        (unless decoded
          ;; If the body was not decoded on insertion, use the usual
          ;; heuristic/rules that we apply to files.
          (decode-coding-inserted-region start (point) url visit beg
                                         end replace))
        (list url size)))))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]